nf-core · mathysgrapotte · Mar 18, 2024 · Mar 12, 2024 · Mar 13, 2024 · Mar 13, 2024
diff --git a/bin/json_schema.py b/bin/json_schema.py
@@ -1,10 +1,13 @@
 
 from abc import ABC, abstractmethod
-from typing import Literal 
+from typing import Literal
+from itertools import product
 
 class JsonSchema(ABC):
     """
-    This class helps decode and work on a difened Json schema used by the stimulus pipeline
+    This class helps decode and work on a defined JSON schema used by the stimulus pipeline.
+    TODO: use a real JSON Schema library to check that each noise and split entry has the correct keys associated to it.
+    link -> https://json-schema.org/learn/getting-started-step-by-step#create
     """
     def __init__(self, schema: dict ) -> None:
         self.schema                = schema
@@ -25,10 +28,11 @@ def __init__(self, schema: dict ) -> None:
         # check that inside noise dictionary there are no repeated column_nmae values and return them otherwise send error
         self.column_names = self._check_repeated_column_names()
 
-
         # check that noise dictionary have a coherent number of parameters values in case of column_wise for self.interpret_parmas_mode
-        self.number_culumn_wise_val = self._check_params_schema()
-
+        self.number_culumn_wise_val = self._check_noise_params_schema()
+
+
+
     def _check_repeated_column_names(self) -> list:
         """
         Helper function that ensures that inside noise dictionary there are no column:names repeated values
@@ -51,25 +55,24 @@ def _check_repeated_column_names(self) -> list:
 
 
 
-    def _check_params_schema(self) -> int:
+    def _check_noise_params_schema(self) -> int:
         """
-        Help function to check if the number of values in params in the noise dictionary is consisten among all params.
+        Help function to check if the number of values in params in the noise dictionary is consistent among all params.
         If there is {"NoiserName" : { "params": [{"val1":[0, 1]}], "OtherNoiser" : { "params": [{"val1":[2, 3], "val3":[4]}]}}
         it will raise error because the val3 has only a list of len() 1 instead of 2
         otherwise it resturn the len()
         """
 
-        # in case there is no noise dictionary but a custom one instead or if interpret_params_mode is in all_combinations mode
-        if (not self.noise_arg and self.custom_arg) or self.interpret_params_mode == 'all_combinations' :
-            return None
+        # in case there is no noise dictionary or if interpret_params_mode is in all_combinations mode
+        if not self.noise_arg  or self.interpret_params_mode == 'all_combinations' :
+            return 0
 
         num_params_list = []
         # Iterate through the given dictionary becuse more than one column_name values could be specified for ex.
         for i, col_name_dictionary in enumerate(self.noise_arg):
 
             # take into account that there could be the keyword default
             if col_name_dictionary["params"] == "default":
-                # TODO think what to do in this case
                 continue
 
             # iterate throught the possible multiple parmaeters, some noisers could have more than one parameter flag
@@ -83,7 +86,167 @@ def _check_params_schema(self) -> int:
             return num_params_list[0]
         else:
             raise ValueError(f"Expected the same number of values for all the params under noise value, but received a discordant ammount instead.")
+
+
+
+    def _transform_noise_dict(self):
+        """
+        Reshape self.noise_arg into {column_name: {noiser_name: params}}.
+        "name" may be one string or a list of strings; params is either the literal
+        "default" (kept as is) or a list holding one params entry per noiser, by position.
+        A noiser name repeated for the same column gets '-#<position>' appended so that
+        it does not overwrite the earlier entry.
+        """
+        by_column = {}
+        for entry in self.noise_arg:
+            names = entry['name']
+            if isinstance(names, str):
+                names = [names]
+            for position, noiser in enumerate(names):
+                params = entry['params']
+                chosen = params if params == "default" else params[position]
+                # the per-column dict is created here, so an empty name list adds nothing
+                taken = by_column.setdefault(entry["column_name"], {})
+                if noiser in taken:
+                    noiser = noiser + '-#' + str(position)
+                taken[noiser] = chosen
+        return by_column
+
+
+    def _generate_cartesian_product_combinations(self, d: dict) -> list:
+        """
+        Build every way of picking exactly one inner key per outer key of a nested dictionary.
+
+        d is expected to look like {outer: {inner: value}}. Each returned dictionary keeps
+        all the outer keys, each mapped to a one-item dict {inner: value} for the inner key
+        picked in that combination. Example ->
+
+            {"a": {"x": 1, "y": 2}, "b": {"z": 3}}
+            gives [{"a": {"x": 1}, "b": {"z": 3}}, {"a": {"y": 2}, "b": {"z": 3}}]
+
+        An empty d gives [{}], because the product of no iterables has exactly one (empty) element.
+        """
+        outer_keys = list(d)
+        # iterating a dict yields its keys, so product() walks over the inner keys of each outer entry
+        picks_per_combination = product(*(d[outer] for outer in outer_keys))
+
+        return [
+            {outer: {inner: d[outer][inner]} for outer, inner in zip(outer_keys, picks)}
+            for picks in picks_per_combination
+        ]
+
+
+
+    def _handle_parameter_selection(self, d: dict, param_index: int) -> dict:
+        """
+        Turn a one-item {name: params} dict into {"name": name, "params": {...}}, keeping only the
+        param_index-th value of every parameter list. params == 'default' gives empty params.
+        Only the first item of d is used; an empty d gives None.
+        """
 
+        if not d:
+            return None
+        name, params = next(iter(d.items()))
+        # drop the '-#<k>' suffix that _transform_noise_dict appends to repeated noiser names
+        name = name.split('-#')[0]
+        if params == 'default':
+            return {"name": name, "params": {}}
+        # pick the value at position param_index for each parameter
+        selected = {p_name: values[param_index] for p_name, values in params.items()}
+        return {"name": name, "params": selected}
+
+
+
+    def noise_column_wise_combination(self) -> list:
+        """
+        Works on the self.noise_arg dictionary to compute all column wise combinations of the parameters and noise functions specified.
+        The combinations of noisers is all against all, except there can not be two noisers for the same column_name.
+        Combinations of noisers will always include exactly one noiser per column_name.
+        example for noisers ->
+
+        column_name : 1                                  column_name : 2
+        name : [noiser1, noiser2]                        name: [othernoiser]
+
+        combinations ->
+            noiser1 - othernoiser
+            noiser2 - othernoiser
+
+        This is how noiser functions are selected, and each of the above combinations is then repeated once per parameter value index.
+        Again an example shows it better ->
+
+        column_name : 1                                                 column_name : 2
+        name : [noiser1, noiser2]                                       name: [othernoiser]
+        parameters : [{p1 : [1 ,2 ,3]}, {p1 : [1.5, 2.5, 3.5 ]}]        parameters : [{p1 : [4 ,5 ,6], p2 : [7, 8, 9]}]
+
+        combinations ->
+            noiser1 (p1 = 1) - othernoiser (p1 = 4, p2 = 7)
+            noiser1 (p1 = 2) - othernoiser (p1 = 5, p2 = 8)
+            noiser1 (p1 = 3) - othernoiser (p1 = 6, p2 = 9)
+            noiser2 (p1 = 1.5) - othernoiser (p1 = 4, p2 = 7)
+            noiser2 (p1 = 2.5) - othernoiser (p1 = 5, p2 = 8)
+            noiser2 (p1 = 3.5) - othernoiser (p1 = 6, p2 = 9)
+
+        Returns a list with one entry per combination, each entry being a list of dictionaries with the keys
+        column_name, name and params (one dictionary per column_name); an empty list if there is no noise entry.
+        """
+        # no noise entry (None or empty): nothing to combine, and iterating None would raise TypeError
+        if not self.noise_arg:
+            return []
+
+        # transform noise entry in a nested dictionary, with structure {col_name: { noiser_name : {p1 : [1]} }}
+        noise_as_dict = self._transform_noise_dict()
+
+        all_noise_combination = []
+        # pick one noiser per column_name (cartesian product), then emit one entry per parameter index
+        for noise_combo in self._generate_cartesian_product_combinations(noise_as_dict):
+            for params_index in range(self.number_culumn_wise_val):
+                noise_list = []
+                for col_name, noise_dict in noise_combo.items():
+                    single_param_dict = self._handle_parameter_selection(noise_dict, params_index)
+                    single_param_dict["column_name"] = col_name
+                    # reorder the entries by key alphabetically for readability
+                    noise_list.append({key: single_param_dict[key] for key in sorted(single_param_dict)})
+                all_noise_combination.append(noise_list)
+        return all_noise_combination
+
+
+
+    def noise_all_combination(self) -> list:
+        """
+        Placeholder for the all against all combination of noisers and parameters of self.noise_arg; not implemented yet, every call raises ValueError.
+        """
+        # TODO implement this function
+        not_ready = "the function noise_all_combination for the flag interpret_parmas_mode : all_combinations is not implemented yet "
+        raise ValueError(not_ready)
 
 
 
+    def split_combination(self) -> list:
+        """
+        Decouple every entry of self.split_arg so that each output uses a single value per parameter.
+        A splitter whose params is "default" gives one {"split": [entry with params set to {}]} element;
+        otherwise one {"name": ..., "params": {...}} element is produced per index of the value lists
+        found in params[0]. Returns an empty list when there is no split entry.
+        Raises ValueError if the value lists of one splitter differ in length (or there are none).
+        """
+        list_split_comibinations = []
+        # `or []` : a missing split entry (None) behaves like an empty one
+        for split_dict in self.split_arg or []:
+            if split_dict['params'] == "default":
+                # work on a copy: overwriting params inside self.split_arg made a second call fail
+                list_split_comibinations.append({"split": [{**split_dict, 'params': {}}]})
+                # NOTE(review): this element is shaped differently from the ones built below - confirm it is intended
+                continue
+
+            # all the value lists of this splitter must have the same length
+            splitter_params = split_dict['params'][0]
+            distinct_lengths = {len(value) for value in splitter_params.values()}
+            if len(distinct_lengths) != 1:
+                raise ValueError(f"All split params for teh same splitter have to have the same number of elements, this splitter does not: {split_dict['name']}.")
+
+            # one output per position in the value lists
+            for params_index in range(distinct_lengths.pop()):
+                # making the split into a dict the _handle_parameter_selection can use
+                single_param_dict = self._handle_parameter_selection({split_dict['name']: splitter_params}, params_index)
+                list_split_comibinations.append(single_param_dict)
+        return list_split_comibinations
diff --git a/bin/launch_interpret_json.py b/bin/launch_interpret_json.py
@@ -3,15 +3,18 @@
 import argparse
 import json
 from json_schema import JsonSchema
+import os
 
 
 def get_args():
 
-    "get the arguments when using from the commandline"
+    """Parse the command line: -j/--json is the json config file, -d/--out_dir the output directory.
+    Both options are required; the parsed argparse namespace is returned."""
 
     parser = argparse.ArgumentParser(description="")
     parser.add_argument("-j", "--json", type=str, required=True, metavar="FILE", help='The json config file that hold all parameter info')
-
+    parser.add_argument("-d", "--out_dir", type=str, required=True, metavar="DIR", help='The output dir where all he jason are written to. Output Json will be called input_json_nam-#[number].json')
+    
     args = parser.parse_args()
     return args
 
@@ -27,30 +30,62 @@ def interpret_json(input_json: dict) -> list:
     # Initialize json schema it checks for correctness of the Json architecture and fields / values
     schema = JsonSchema(input_json)
 
-    #print("\nnoise_configurations :\n", schema.noise_arg, "\n", type(schema.noise_arg))
-    #print("\nsplit_configurations :\n", schema.split_arg, "\n", type(schema.split_arg))
-    #print("\ncustom_configurations :\n", schema.custom_arg, "\n", type(schema.custom_arg))
-    print(schema.number_culumn_wise_val)
-    #print(schema.experiment, schema.interpret_params_mode, schema.column_names)
-
+    # compute all noise combinations
+    # the function to call is picked with a dictionary keyed by schema.interpret_params_mode
+    # NOTE(review): the key is spelled "culumn_wise"; confirm it matches the value accepted by JsonSchema
+    function_call_dict = {"culumn_wise": schema.noise_column_wise_combination, "all_combinations": schema.noise_all_combination}
+    list_noise_combinations = function_call_dict[schema.interpret_params_mode]()
+
+    # compute all split combinations, there is no concept of column_name here so each splitter simply yields its own entries
+    list_split_combinations = schema.split_combination()
+
+    # combine split possibilities with noise ones in a all vs all manner, each splitter will be assigned to each noiser
+    list_of_json_to_write = []
+    if not list_noise_combinations and not list_split_combinations:
+        # neither noise nor split: a single entry holding only the experiment
+        list_of_json_to_write.append({"experiment": schema.experiment})
+    elif not list_split_combinations:
+        for noiser_dict in list_noise_combinations:
+            list_of_json_to_write.append({"experiment": schema.experiment, "noise": noiser_dict})
+    elif not list_noise_combinations:
+        for splitter_dict in list_split_combinations:
+            list_of_json_to_write.append({"experiment": schema.experiment, "split": splitter_dict})
+    else:
+        # both present: every split is paired with every noise combination (noiser_dict must come from its own loop)
+        for splitter_dict in list_split_combinations:
+            for noiser_dict in list_noise_combinations:
+                list_of_json_to_write.append({"experiment": schema.experiment, "noise": noiser_dict, "split": splitter_dict})
+
+    # deal with custom if present, the dictionaries are passed on exactly as they are (a missing entry adds nothing)
+    for custom_dict in schema.custom_arg or []:
+        list_of_json_to_write.append(custom_dict)
+
+    return list_of_json_to_write
 
-
 
 
-def main(config_json: str) -> str:
+def main(config_json: str, out_dir_path: str) -> str:
+    """Read the config json, expand it with interpret_json and write one json file per result into out_dir_path."""
 
     # open and read Json
     config = {}
     with open(config_json, 'r') as in_json:
         config = json.load(in_json)
 
-    # initialize the json scheme class 
-    interpret_json(config)
-
+    # interpret the json
+    list_json = interpret_json(config)
+    # create the output directory if it doesn't exist
+    os.makedirs(out_dir_path, exist_ok=True)
 
-
+    # files are named <input file name without extension>-#<number>.json, numbered from 1
+    prefix = os.path.splitext(os.path.basename(config_json))[0]
+    for i, elements in enumerate(list_json, start=1):
+        file_path = os.path.join(out_dir_path, f"{prefix}-#{i}.json")
+        with open(file_path, 'w') as file:
+            # json.dumps, not str(): the repr of a dict (single quotes, True/None) is not valid JSON
+            file.write(f"{json.dumps(elements)}\n")
 
 
 if __name__ == "__main__":
     args = get_args()
-    main(args.json)
+    main(args.json, args.out_dir)
diff --git a/examples/pipeline_generated.json b/examples/pipeline_generated.json
@@ -2,25 +2,28 @@
     "experiment": "DnaToFloatExperiment",
     "noise": [
         {
-            "column_name": "input1",
+            "column_name": "hello:input1:dna",
             "name": "UniformTextMasker",
             "params": {"probability": 0.1}
         },
         {
-            "column_name": "input2",
+            "column_name": "hello:input2:prot",
             "name": "UniformTextMasker",
             "params": {"probability": 0.4}
         },
         {
-            "column_name": "label",
+            "column_name": "hola:label:float",
             "name": "GaussianNoise",
             "params": {"mean": 0.5, "std": 0.1}
         }
     ],
-    "split": [
+    "split": 
         {
             "name": "RandomSplitter",
-            "params": {"split": [0.6, 0.8]}
+
+            "params": {"split": [0.6, 0.2, 0.2]}
+
+
+
         }
-    ]
 }