Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Firstpipe module #35

Merged
merged 12 commits into from
Mar 18, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 138 additions & 1 deletion bin/json_schema.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@

from abc import ABC, abstractmethod
from typing import Literal
from typing import Literal
from itertools import product

class JsonSchema(ABC):
"""
Expand Down Expand Up @@ -83,7 +84,143 @@ def _check_params_schema(self) -> int:
return num_params_list[0]
else:
raise ValueError(f"Expected the same number of values for all the params under noise value, but received a discordant ammount instead.")


def _transform_noise_dict(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should be transform_noise_dict without "_" since this calls self.

"""
TODO helper function section
"""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should provide enough documentation with examples

noise_dict = {}
for col_name_dictionary in self.noise_arg:
# The name: field of a noise: entry can be either a simple string or a list of strings, so convert it to a list if it's a string, otherwise leave it unchanged
noiser_list = [col_name_dictionary['name']] if isinstance(col_name_dictionary['name'], str) else col_name_dictionary['name']
# Now get the parameter or set of parameters associated with each noiser, store both together, and add them to the noiser names associated with a given column_name
for k, noiser_name in enumerate(noiser_list):
# handle the fact that params can have "default" as value and not a list
if col_name_dictionary['params'] == "default":
params_to_be_added = "default"
else:
params_to_be_added = col_name_dictionary['params'][k]
# handle the case of multiple noiser with same name in the same list associated to the column_name, solution -> create a scheme to modify the name
if noise_dict.get(col_name_dictionary["column_name"]) and noiser_name in noise_dict.get(col_name_dictionary["column_name"]) :
# Modify the noiser name already present appending a unique key to it
noiser_name = noiser_name + '-#' + str(k)
#noise_dict.setdefault(col_name_dictionary["column_name"], []).append( {noiser_name : params_to_be_added} )
noise_dict.setdefault(col_name_dictionary["column_name"], {})[noiser_name] = params_to_be_added
return noise_dict


def _generate_cartesian_product_combinations(self, d: dict) -> list:
"""
Helper function for creating cartesian product combinations out of a dictionary.
TODO expand explanation
"""
keys = d.keys()
value_lists = d.values()

# Generate Cartesian product of value lists
combinations = product(*value_lists)

# Create dictionaries for each combination
result = []
for combination in combinations:
combined_dict = {}
for key, value in zip(keys, combination):
nested_dict = {value : d[key][value]}
combined_dict.update({key: nested_dict})
result.append(combined_dict)

return result



def _handle_parameter_selection(self, d: dict, param_index: int) -> dict:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should not have "_" as it calls self in the arguments

"""
TODO helper function section
"""

for key, param_dict in d.items():
# remove the suffix used to handle identical noiser names for the same column_name (it is added in the _transform_noise_dict function); this line does nothing if that suffix is not present
key = key.split('-#')[0]
# handle "default" as params value
if param_dict == 'default':
return {"name" : key, "params" : param_dict}
else:
tmp_param_dict = {}
# iterate through the possibly multiple parameter options
for param_name, param_value in param_dict.items():
tmp_param_dict[param_name] = param_value[param_index]
return {"name": key, "params": tmp_param_dict}



def noise_column_wise_combination(self) -> list:
"""
works on the self.noise_arg dictionary to compute all column wise combinations for parametrs and noise function specified.
The combinations of noisers is all against all, except there can not be two noisers for the same column_name.
Combinations of noisers will always include at least one noiser per column_name.
example for noisers ->

column_name : 1 column_name : 2
name : [noiser1, noiser2] name: [othernoiser]

combinations ->
noiser1 - othernoiser
noiser2 - othernoiser

Now this is how noiser functions are selected but for each of the above combination there are as many as there are parameters.
Again an example shows it better ->

column_name : 1 column_name : 2
name : [noiser1, noiser2] name: [othernoiser]
parameters : [{p1 : [1 ,2 ,3]}, {p1 : [1.5, 2.5, 3.5 ]}] parameters : [{p1 : [4 ,5 ,6], p2 : [7, 8, 9]}]

combinations ->
noiser1 (p1 = 1) - othernoiser (p1 = 4, p2 = 7)
noiser1 (p1 = 2) - othernoiser (p1 = 5, p2 = 8)
noiser1 (p1 = 3) - othernoiser (p1 = 6, p2 = 9)
noiser2 (p1 = 1.5) - othernoiser (p1 = 4, p2 = 7)
noiser2 (p1 = 2.5) - othernoiser (p1 = 5, p2 = 8)
noiser2 (p1 = 3.5) - othernoiser (p1 = 6, p2 = 9)
"""

# transform noise entry in a nested dictionary, with structure {col_name: { noiser_name : {parameters : {p1 : [1]} }}}
noise_as_dict = self._transform_noise_dict()

# Create cartesian product of noiser names based on the above dictionary
noiser_combination_list = self._generate_cartesian_product_combinations(noise_as_dict)

# for each noiser combination create the column wise selection of parameters associated
all_noise_combination = []
for noise_combo in noiser_combination_list:
# select the parameter iterating through the total number of parameters value fopr each col type
for params_index in range(self.number_culumn_wise_val):
noise_list = []
for col_name, noise_dict in noise_combo.items():
single_param_dict = self._handle_parameter_selection(noise_dict, params_index)
# add the column_name field to this dictionary
single_param_dict["column_name"] = col_name
# reorder the entries by key alphabetically for readability
sorted_dict = {key: single_param_dict[key] for key in sorted(single_param_dict)}
noise_list.append(sorted_dict)
all_noise_combination.append({'noise' : noise_list })
return all_noise_combination



def noise_all_combination(self) -> list:
"""
Works on the self.noise_arg dictionary to compute all possible combinations of parameters and noisers in an all-against-all fashion.
"""

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Possible to write an issue about this ?

# TODO implement this function
raise ValueError("the function noise_all_combination for the flag interpret_parmas_mode : all_combinations is not implemented yet ")



def split_combination(self) -> list:
"""
TODO add description
"""

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Possible to write an issue about this ?

# iterate through the split entry and return a list of split possibilities, where each splitter_name has one parameter or one set of parameters
18 changes: 13 additions & 5 deletions bin/launch_interpret_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@

def get_args():

"get the arguments when using from the commandline"
"""get the arguments when using from the commandline
TODO write help function description"""

parser = argparse.ArgumentParser(description="")
parser.add_argument("-j", "--json", type=str, required=True, metavar="FILE", help='The json config file that hold all parameter info')
Expand All @@ -30,11 +31,18 @@ def interpret_json(input_json: dict) -> list:
#print("\nnoise_configurations :\n", schema.noise_arg, "\n", type(schema.noise_arg))
#print("\nsplit_configurations :\n", schema.split_arg, "\n", type(schema.split_arg))
#print("\ncustom_configurations :\n", schema.custom_arg, "\n", type(schema.custom_arg))
print(schema.number_culumn_wise_val)
#print(schema.number_culumn_wise_val)
#print(schema.experiment, schema.interpret_params_mode, schema.column_names)


Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally, prints should not be used and replaced by a logger if you are using those to debug (see https://docs.python.org/3/howto/logging.html )


# compute all noise combinations
# first select the right function call based on schema.interpret_params_mode; done with a dictionary like the following because if statements are inefficient
function_call_dict = {"culumn_wise": schema.noise_column_wise_combination, "all_combinations": schema.noise_all_combination}
list_noise_combinations = function_call_dict[schema.interpret_params_mode]()
print(list_noise_combinations, len(list_noise_combinations))

# compute all split combinations, this will only be all vs all because there is no concept of column_name
list_split_combinations = schema.split_combination()
print(list_split_combinations, len(list_split_combinations))


def main(config_json: str) -> str:
Expand All @@ -44,7 +52,7 @@ def main(config_json: str) -> str:
with open(config_json, 'r') as in_json:
config = json.load(in_json)

# initialize the json scheme class
# interpret the json
interpret_json(config)


Expand Down
14 changes: 7 additions & 7 deletions examples/pipeline_generated.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,25 @@
"experiment": "DnaToFloatExperiment",
"noise": [
{
"column_name": "input1",
"column_name": "inphello:input1:dnaut1",
"name": "UniformTextMasker",
"params": [{"probability": [0.1]}]
"params": {"probability": 0.1}
},
{
"column_name": "input2",
"column_name": "hello:input2:prot",
"name": "UniformTextMasker",
"params": [{"probability": [0.4]}]
"params": {"probability": 0.4}
},
{
"column_name": "label",
"column_name": "hola:label:float",
"name": "GaussianNoise",
"params": [{"mean": [0.5], "std": [0.1]}]
"params": {"mean": 0.5, "std": 0.1}
}
],
"split": [
{
"name": "RandomSplitter",
"params": [{"split": [[0.6, 0.8]]}]
"params": {"split": [0.6, 0.8]}
}
]
}
16 changes: 10 additions & 6 deletions examples/user_given.json
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
{
"experiment": "DnaToFloatExperiment",
"experiment": "MyCustomExperiment",
"interpret_parmas_mode": "culumn_wise",
"noise": [
{
"column_name": "input1",
"name": ["UniformTextMasker", "AnotherNoiser", "YetAnotherNoiser"],
"column_name": "hello:input1:dna",
"name": ["UniformTextMasker", "AnotherNoiser", "AnotherNoiser"],
"params": [{"probability": [0.1, 0.2, 0.3]}, {"probability": [0.11, 0.21, 0.31]}, {"probability": [0.12, 0.22, 0.32]}]
},
{
"column_name": "input2",
"column_name": "hello:input2:prot",
"name": ["UniformTextMasker", "AnotherNoiser"],
"params": [{"probability": [0.4, 0.5, 0.6]}, {"probability": [0.1, 0.2, 0.3]}]
"params": "default"
},
{
"column_name": "label",
"column_name": "hola:label:float",
"name": "GaussianNoise",
"params": [{"mean": [0.5, 0.6, 0.7], "std": [0.1, 0.2, 0.3]}]
}
Expand All @@ -22,6 +22,10 @@
{
"name": "RandomSplitter",
"params": [{"split": [[0.6, 0.8], [0.7, 0.85]]}]
},
{
"name": "SomeSplitter",
"params": "default"
}
],
"custom": [
Expand Down
Loading