Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Firstpipe module #35

Merged
merged 12 commits into from
Mar 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
185 changes: 174 additions & 11 deletions bin/json_schema.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@

from abc import ABC, abstractmethod
from typing import Literal
from typing import Literal
from itertools import product

class JsonSchema(ABC):
"""
This class helps decode and work on a difened Json schema used by the stimulus pipeline
This class helps decode and work on a difened Json schema used by the stimulus pipeline.
TODO add Json.schema real library to control that each noise, split have the correct keys associated to them.
link -> https://json-schema.org/learn/getting-started-step-by-step#create
"""
def __init__(self, schema: dict ) -> None:
self.schema = schema
Expand All @@ -25,10 +28,11 @@ def __init__(self, schema: dict ) -> None:
# check that inside noise dictionary there are no repeated column_nmae values and return them otherwise send error
self.column_names = self._check_repeated_column_names()


# check that noise dictionary have a coherent number of parameters values in case of column_wise for self.interpret_parmas_mode
self.number_culumn_wise_val = self._check_params_schema()

self.number_culumn_wise_val = self._check_noise_params_schema()



def _check_repeated_column_names(self) -> list:
"""
Helper function that ensures that inside noise dictionary there are no column:names repeated values
Expand All @@ -51,25 +55,24 @@ def _check_repeated_column_names(self) -> list:



def _check_params_schema(self) -> int:
def _check_noise_params_schema(self) -> int:
"""
Help function to check if the number of values in params in the noise dictionary is consisten among all params.
Help function to check if the number of values in params in the noise dictionary is consistent among all params.
If there is {"NoiserName" : { "params": [{"val1":[0, 1]}], "OtherNoiser" : { "params": [{"val1":[2, 3], "val3":[4]}]}}
it will raise error because the val3 has only a list of len() 1 instead of 2
otherwise it resturn the len()
"""

# in case there is no noise dictionary but a custom one instead or if interpret_params_mode is in all_combinations mode
if (not self.noise_arg and self.custom_arg) or self.interpret_params_mode == 'all_combinations' :
return None
# in case there is no noise dictionary or if interpret_params_mode is in all_combinations mode
if not self.noise_arg or self.interpret_params_mode == 'all_combinations' :
return 0

num_params_list = []
# Iterate through the given dictionary becuse more than one column_name values could be specified for ex.
for i, col_name_dictionary in enumerate(self.noise_arg):

# take into account that there could be the keyword default
if col_name_dictionary["params"] == "default":
# TODO think what to do in this case
continue

# iterate throught the possible multiple parmaeters, some noisers could have more than one parameter flag
Expand All @@ -83,7 +86,167 @@ def _check_params_schema(self) -> int:
return num_params_list[0]
else:
raise ValueError(f"Expected the same number of values for all the params under noise value, but received a discordant ammount instead.")



def _transform_noise_dict(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should be transform_noise_dict without "_" since this calls self.

"""
TODO helper fucntion section
"""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should provide enough documentation with examples

noise_dict = {}
for col_name_dictionary in self.noise_arg:
# The name: field of a noise: can be either a simlpe string or list of strings, so convert such variable to a list if it's a string, otherwise leave it unchanged
noiser_list = [col_name_dictionary['name']] if isinstance(col_name_dictionary['name'], str) else col_name_dictionary['name']
# Now get the parametrs or set of parameters associated with each noiser and store bot in a tuple and append to list noiser names associated to a given clumn_name
for k, noiser_name in enumerate(noiser_list):
# handle the fact that params can have "default" as value and not a list
if col_name_dictionary['params'] == "default":
params_to_be_added = "default"
else:
params_to_be_added = col_name_dictionary['params'][k]
# handle the case of multiple noiser with same name in the same list associated to the column_name, solution -> create a scheme to modify the name
if noise_dict.get(col_name_dictionary["column_name"]) and noiser_name in noise_dict.get(col_name_dictionary["column_name"]) :
# Modify the noiser name already present appending a unique key to it
noiser_name = noiser_name + '-#' + str(k)
#noise_dict.setdefault(col_name_dictionary["column_name"], []).append( {noiser_name : params_to_be_added} )
noise_dict.setdefault(col_name_dictionary["column_name"], {})[noiser_name] = params_to_be_added
return noise_dict


def _generate_cartesian_product_combinations(self, d: dict) -> list:
"""
Helper function for creating cartesian product combinations out of a dictionary.
TODO expand explanation
"""
keys = d.keys()
value_lists = d.values()

# Generate Cartesian product of value lists
combinations = product(*value_lists)

# Create dictionaries for each combination
result = []
for combination in combinations:
combined_dict = {}
for key, value in zip(keys, combination):
nested_dict = {value : d[key][value]}
combined_dict.update({key: nested_dict})
result.append(combined_dict)

return result



def _handle_parameter_selection(self, d: dict, param_index: int) -> dict:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should not have "_" as it calls self in the arguments

"""
TODO helper fucntion section
"""

for key, param_dict in d.items():
# remove the appendix used to handle same noise names for same column_name, this is done in the _transform_noise_dict function, this line does nothing if that key is not present afterall
key = key.split('-#')[0]
# handle "defualt" as params value returning a empty dict
if param_dict == 'default':
return {"name" : key, "params" : {}}
else:
tmp_param_dict = {}
# iterate through the possible multiple parameter otpions
for param_name, param_value in param_dict.items():
tmp_param_dict[param_name] = param_value[param_index]
return {"name": key, "params": tmp_param_dict}



def noise_column_wise_combination(self) -> list:
"""
works on the self.noise_arg dictionary to compute all column wise combinations for parametrs and noise function specified.
The combinations of noisers is all against all, except there can not be two noisers for the same column_name.
Combinations of noisers will always include at least one noiser per column_name.
example for noisers ->

column_name : 1 column_name : 2
name : [noiser1, noiser2] name: [othernoiser]

combinations ->
noiser1 - othernoiser
noiser2 - othernoiser

Now this is how noiser functions are selected but for each of the above combination there are as many as there are parameters.
Again an example shows it better ->

column_name : 1 column_name : 2
name : [noiser1, noiser2] name: [othernoiser]
parameters : [{p1 : [1 ,2 ,3]}, {p1 : [1.5, 2.5, 3.5 ]}] parameters : [{p1 : [4 ,5 ,6], p2 : [7, 8, 9]}]

combinations ->
noiser1 (p1 = 1) - othernoiser (p1 = 4, p2 = 7)
noiser1 (p1 = 2) - othernoiser (p1 = 5, p2 = 8)
noiser1 (p1 = 3) - othernoiser (p1 = 6, p2 = 9)
noiser2 (p1 = 1.5) - othernoiser (p1 = 4, p2 = 7)
noiser2 (p1 = 2.5) - othernoiser (p1 = 5, p2 = 8)
noiser2 (p1 = 3.5) - othernoiser (p1 = 6, p2 = 9)
"""

# transform noise entry in a nested dictionary, with structure {col_name: { noiser_name : {p1 : [1]} }}
noise_as_dict = self._transform_noise_dict()

# Create cartesian product of noiser names based on the above dictionary
noiser_combination_list = self._generate_cartesian_product_combinations(noise_as_dict)

# for each noiser combination create the column wise selection of parameters associated
all_noise_combination = []
for noise_combo in noiser_combination_list:
# select the parameter iterating through the total number of parameters value fopr each col type
for params_index in range(self.number_culumn_wise_val):
noise_list = []
for col_name, noise_dict in noise_combo.items():
single_param_dict = self._handle_parameter_selection(noise_dict, params_index)
# add the column_name field to this dictionary
single_param_dict["column_name"] = col_name
# reorder the entries by key alphabetically for readability
sorted_dict = {key: single_param_dict[key] for key in sorted(single_param_dict)}
noise_list.append(sorted_dict)
all_noise_combination.append(noise_list)
return all_noise_combination



def noise_all_combination(self) -> list:
    """
    Meant to work on self.noise_arg and compute all possible combinations of parameters
    and noisers in an all against all fashion.

    Not implemented yet: calling it always raises ValueError.
    """
    # TODO implement this function (a reviewer asked whether an issue can be opened to track it)
    not_implemented_message = "the function noise_all_combination for the flag interpret_parmas_mode : all_combinations is not implemented yet "
    raise ValueError(not_implemented_message)



def split_combination(self) -> list:
"""
TODO add description
"""

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Possible to write an issue about this ?

list_split_comibinations = []
# iterate through the split entry and return a list of split possibilities, where each splitter_name has one/set of one parametyers
for i, split_dict in enumerate(self.split_arg):
# jsut create a new dictionary for each set of params associated to each split_name, basically if a splitter has more than one element in his params: then they should be decoupled so to have each splitter with only one value for params:
# if the value of params: is "default" just return the dictionary with an empty dict as value of params :
if split_dict['params'] == "default":
split_dict['params'] = {}
list_split_comibinations.append({ "split" : [split_dict]})
else:
# Get lengths of all lists
lengths = {key: len(value) for key, value in split_dict['params'][0].items()}

# Check if all lengths are the same
all_lengths_same = set(lengths.values())

if len(all_lengths_same) != 1 :
raise ValueError(f"All split params for teh same splitter have to have the same number of elements, this splitter does not: {split_dict['name']}.")
else:
# iterate at level of number of params_values
for params_index in range(list(all_lengths_same)[0]):
# making the split into a dict the _handle_parameter_selection can use
single_param_dict = self._handle_parameter_selection({split_dict['name']: split_dict['params'][0] }, params_index)
list_split_comibinations.append(single_param_dict)
return list_split_comibinations
65 changes: 50 additions & 15 deletions bin/launch_interpret_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,18 @@
import argparse
import json
from json_schema import JsonSchema
import os


def get_args():
    """get the arguments when using from the commandline
    TODO write help function description"""

    # (short flag, long flag, metavar, help text) for each required string option
    option_table = (
        ("-j", "--json", "FILE", 'The json config file that hold all parameter info'),
        ("-d", "--out_dir", "DIR", 'The output dir where all he jason are written to. Output Json will be called input_json_nam-#[number].json'),
    )

    parser = argparse.ArgumentParser(description="")
    for short_flag, long_flag, placeholder, help_text in option_table:
        parser.add_argument(short_flag, long_flag, type=str, required=True, metavar=placeholder, help=help_text)

    return parser.parse_args()

Expand All @@ -27,30 +30,62 @@ def interpret_json(input_json: dict) -> list:
# Initialize json schema it checks for correctness of the Json architecture and fields / values
schema = JsonSchema(input_json)

#print("\nnoise_configurations :\n", schema.noise_arg, "\n", type(schema.noise_arg))
#print("\nsplit_configurations :\n", schema.split_arg, "\n", type(schema.split_arg))
#print("\ncustom_configurations :\n", schema.custom_arg, "\n", type(schema.custom_arg))
print(schema.number_culumn_wise_val)
#print(schema.experiment, schema.interpret_params_mode, schema.column_names)

# compute all noise combinations
# first set right fucntion call based on schema.interpret_params_mode, done like following because if are inefficient
# both function output an empty list if there is no noise argument
function_call_dict = {"culumn_wise": schema.noise_column_wise_combination, "all_combinations": schema.noise_all_combination}
list_noise_combinations = function_call_dict[schema.interpret_params_mode]()

# compute all split combinations, this will only be all vs all because there is no concept of column_name, it will return empty list if there is no split function
list_split_combinations = schema.split_combination()

# combine split possibilities with noise ones in a all vs all manner, each splitter wil be assigned to each noiser
list_of_json_to_write = []

# Check if both lists are empty
if not list_noise_combinations and not list_split_combinations:
list_of_json_to_write.append({"experiment": schema.experiment})
else:
if not list_split_combinations: # Check if list_split_combinations is empty
for noiser_dict in list_noise_combinations:
list_of_json_to_write.append({"experiment": schema.experiment, "noise": noiser_dict})
else:
for splitter_dict in list_split_combinations:
if not list_noise_combinations: # Check if list_noise_combinations is empty
list_of_json_to_write.append({"experiment": schema.experiment, "split": splitter_dict})
else:
list_of_json_to_write.append({"experiment": schema.experiment, "noise": noiser_dict, "split": splitter_dict})

# deal wiht custom if present, in this case nothing at all will be done to the dictionary, it will just be passed as it is
for custom_dict in schema.custom_arg :
list_of_json_to_write.append(custom_dict)

return list_of_json_to_write




def main(config_json: str, out_dir_path: str) -> None:
    """
    Read the Json config, interpret it and write one Json file per resulting combination.

    Args:
        config_json: path of the input Json config file.
        out_dir_path: directory the output files are written to, created if it does not exist.

    The output files are named <input file name without extension>-#<number>.json,
    with numbers starting from 1 and following the order returned by interpret_json.
    """

    # open and read Json
    with open(config_json, 'r') as in_json:
        config = json.load(in_json)

    # interpret the json
    list_json = interpret_json(config)

    # Create the directory if it doesn't exist
    os.makedirs(out_dir_path, exist_ok=True)

    # the prefix of the output names does not depend on the element, compute it once
    prefix = os.path.splitext(os.path.basename(config_json))[0]

    # Populate the directory with files containing the single Json combination
    for number, element in enumerate(list_json, start=1):
        file_path = os.path.join(out_dir_path, f"{prefix}-#{number}.json")
        with open(file_path, 'w') as file:
            # json.dump and not str(): the str() of a dict uses single quotes and Python literals
            # (True / None), which a Json parser can not read back
            json.dump(element, file)
            file.write("\n")


if __name__ == "__main__":
    # main needs both the input Json path and the output directory
    args = get_args()
    main(args.json, args.out_dir)
15 changes: 9 additions & 6 deletions examples/pipeline_generated.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,28 @@
"experiment": "DnaToFloatExperiment",
"noise": [
{
"column_name": "input1",
"column_name": "inphello:input1:dnaut1",
"name": "UniformTextMasker",
"params": {"probability": 0.1}
},
{
"column_name": "input2",
"column_name": "hello:input2:prot",
"name": "UniformTextMasker",
"params": {"probability": 0.4}
},
{
"column_name": "label",
"column_name": "hola:label:float",
"name": "GaussianNoise",
"params": {"mean": 0.5, "std": 0.1}
}
],
"split": [
"split":
{
"name": "RandomSplitter",
"params": {"split": [0.6, 0.8]}

"params": {"split": [0.6, 0.2, 0.2]}



}
]
}
Loading
Loading