Skip to content

Commit 87b77cd

Browse files
Merge pull request #24 from mathysgrapotte/json_to_noise_func_call
Json to noise func call
2 parents 6afdc69 + da48719 commit 87b77cd

File tree

7 files changed

+106
-40
lines changed

7 files changed

+106
-40
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,6 @@ works
1111
/singularity_cache
1212
/results/
1313
.coverage
14-
.vscode
14+
.vscode/
15+
bin/.vscode/
16+

bin/src/data/__init__.py

Whitespace-only changes.

bin/src/data/data_types/data_types.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,13 @@ def encode_all(self, data: list) -> np.array:
2929

3030
class Dna(AbstractType):
3131
"""
32-
class for dealing with DNA data
32+
class for dealing with DNA data
33+
# TODO make a Text base class for this class and other text based classes (rna, protein etc...)
3334
"""
3435

3536
def __init__(self, **parameters) -> None:
3637
self.one_hot_encoder = encoders.TextOneHotEncoder(alphabet=parameters.get("one_hot_encoder_alphabet", "acgt"))
37-
self.uniform_text_masker = noise_generators.UniformTextMasker(probability=parameters.get("text_masker_probability", 0.1))
38+
self.uniform_text_masker = noise_generators.UniformTextMasker()
3839

3940
def one_hot_encode(self, data: str) -> np.array:
4041
"""
@@ -62,26 +63,30 @@ def encode_all(self, data: list, encoder: Literal['one_hot'] = 'one_hot') -> lis
6263
raise ValueError(f"Unknown encoder {encoder}")
6364

6465

65-
def add_noise_uniform_text_masker(self, data: str, seed: float = None) -> str:
66+
def add_noise_uniform_text_masker(self, data: str, seed: float = None, **noise_params) -> str:
6667
"""
6768
Adds noise to the data of a single input.
6869
"""
69-
return self.uniform_text_masker.add_noise(data, seed=seed)
70+
# get the probability param from noise_params, default value is set to 0.1
71+
probability = noise_params.get("probability", 0.1)
72+
return self.uniform_text_masker.add_noise(data, probability=probability, seed=seed)
7073

71-
def add_noise_uniform_text_masker_all_inputs(self, data: list, seed: float = None) -> list:
74+
def add_noise_uniform_text_masker_all_inputs(self, data: list, seed: float = None, **noise_params) -> list:
7275
"""
7376
Adds noise to the data of multiple inputs.
7477
"""
75-
return self.uniform_text_masker.add_noise_multiprocess(data, seed=seed)
78+
# get the probability param from noise_params, default value is set to 0.1
79+
probability = noise_params.get("probability", 0.1)
80+
return self.uniform_text_masker.add_noise_multiprocess(data, probability=probability, seed=seed)
7681

7782

7883
class Float():
7984
"""
8085
class for dealing with float data
8186
"""
8287

83-
def __init__(self, **parameters) -> None:
84-
self.gaussian_noise = noise_generators.GaussianNoise(mean=parameters.get("gaussian_noise_mean", 0), std=parameters.get("gaussian_noise_std", 1))
88+
def __init__(self) -> None:
89+
self.gaussian_noise = noise_generators.GaussianNoise()
8590

8691
def add_noise_gaussian_noise(self, data: float, seed: float = None) -> float:
8792
"""

bin/src/data/data_types/noise/noise_generators.py

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -40,39 +40,43 @@ class UniformTextMasker(AbstractNoiseGenerator):
4040
This noise generator replaces characters with 'N' with a given probability.
4141
"""
4242

43-
def __init__(self, probability: float = 0.1) -> None:
44-
self.probability = probability
4543

46-
47-
def add_noise(self, data: str, seed: float = None) -> str:
44+
def add_noise(self, data: str, probability: float = 0.1, seed: float = None) -> str:
4845
"""
4946
Adds noise to the data.
5047
"""
5148

5249
np.random.seed(seed)
53-
return ''.join([c if np.random.rand() > self.probability else 'N' for c in data])
54-
50+
return ''.join([c if np.random.rand() > probability else 'N' for c in data])
51+
52+
def add_noise_multiprocess(self, data: list, probability: float = 0.1, seed: float = None) -> list:
53+
"""
54+
Adds noise to the data using multiprocessing.
55+
"""
56+
57+
with mp.Pool(mp.cpu_count()) as pool:
58+
function_specific_input = [(item, probability, seed) for item in data]
59+
return pool.starmap(self.add_noise, function_specific_input)
60+
5561
class GaussianNoise(AbstractNoiseGenerator):
5662
"""
5763
This noise generator adds gaussian noise to float values
5864
"""
5965

60-
def __init__(self, mean: float = 0, std: float = 1) -> None:
61-
self.mean = mean
62-
self.std = std
6366

64-
def add_noise(self, data: float, seed: float = None) -> float:
67+
def add_noise(self, data: float, mean: float = 0, std: float= 0, seed: float = None) -> float:
6568
"""
6669
Adds noise to a single point of data.
6770
"""
6871

6972
np.random.seed(seed)
70-
return data + np.random.normal(self.mean, self.std)
73+
return data + np.random.normal(mean, std)
7174

72-
def add_noise_multiprocess(self, data: list, seed: float = None) -> list:
75+
def add_noise_multiprocess(self, data: list, mean: float = 0, std: float = 0, seed: float = None) -> list:
7376
"""
7477
Adds noise to the data using np arrays
78+
# TODO return a np array to gain performance.
7579
"""
7680

7781
np.random.seed(seed)
78-
return list(np.array(data) + np.random.normal(self.mean, self.std, len(data)))
82+
return list(np.array(data) + np.random.normal(mean, std, len(data)))

bin/src/data/experiments.py

Lines changed: 43 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from typing import Any
1212
from .data_types import data_types as data_types
1313
from .spliters import spliters as spliters
14+
from copy import deepcopy
1415
import numpy as np
1516

1617
class AbstractExperiment(ABC):
@@ -32,12 +33,42 @@ def get_split_indexes(self, data: list, split: tuple) -> list | list | list:
3233
"""
3334
raise NotImplementedError
3435

36+
def get_keys_based_on_name_data_type_or_input(self, data: dict, column_name: str = None, data_type: str = None, category = None) -> list:
37+
"""
38+
Returns the keys of the data that are of a specific type, name or category.
39+
If the column_name is specified, it will return all the keys that contain the column_name in their name.
40+
If the data_type is specified, it will return all the keys that contain the data_type in their name.
41+
If the data_type and the category are specified, it will return all the keys that contain the data_type and the category in their name.
42+
"""
43+
44+
# Check that at least one of column_name, data_type or category is not None
45+
if column_name is None and data_type is None and category is None:
46+
raise ValueError("At least one of column_name, data_type or category should be specified.")
47+
48+
# Check that category is not the only one specified
49+
if category is not None and column_name is None and data_type is None:
50+
raise ValueError("category cannot be the only one specified.")
51+
52+
if column_name is not None:
53+
return [key for key in data if column_name in key.split(':')[0]]
54+
if data_type is not None:
55+
if category is not None:
56+
return [key for key in data if data_type in key.split(':')[1] and category in key.split(':')[2]]
57+
else:
58+
return [key for key in data if data_type in key.split(':')[1]]
59+
3560

36-
def noise(self, data: Any) -> Any:
61+
def noise(self, data: Any, noise_method: str, **noise_params: dict) -> Any:
3762
"""
38-
Adds noise to the data.
63+
Adds noise to the data by calling the method of this class whose name is given by noise_method, passing it the data and **noise_params.
3964
"""
40-
raise NotImplementedError
65+
# check if noise_method exists in the class; if it does, call it with the associated **noise_params, otherwise raise an error
66+
67+
if hasattr(self, noise_method):
68+
return getattr(self, noise_method)(data, **noise_params)
69+
else:
70+
raise NotImplementedError(f"No noise method {noise_method} in the class {self.__class__.__name__}")
71+
4172

4273
class DnaToFloatExperiment(AbstractExperiment):
4374
"""
@@ -49,18 +80,19 @@ def __init__(self, seed: float = None, **parameters) -> None:
4980
self.dna = data_types.Dna(**parameters)
5081
self.float = data_types.Float(**parameters)
5182

52-
def add_noise(self, data: list) -> list:
83+
def noise_dna_uniform_masker(self, data: dict, **noise_params) -> dict:
5384
"""
5485
Adds uniform text masker noise to the data, given as a dictionary of inputs.
86+
Applied on all input keys that have the dna data type.
5587
"""
56-
return self.dna.add_noise_uniform_text_masker_all_inputs(data, seed=self.seed)
57-
58-
def noise_scheme(self, data: list, params: dict) -> dict:
59-
output = {}
60-
for key in params:
61-
output[key] = self.add_noise(data, params[key])
6288

63-
return output
89+
90+
dna_type_keys = self.get_keys_based_on_name_data_type_or_input(data, data_type='dna')
91+
92+
for key in dna_type_keys:
93+
data[key] = self.dna.add_noise_uniform_text_masker_all_inputs(data[key], seed=self.seed, **noise_params)
94+
95+
return data
6496

6597

6698

bin/tests/test_experiments.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,32 @@
22
import numpy.testing as npt
33
import unittest
44
from bin.src.data.experiments import DnaToFloatExperiment
5+
from copy import deepcopy
56

67
class TestDnaToFloatExperiment(unittest.TestCase):
78

89
def setUp(self):
910
self.dna_to_float_experiment = DnaToFloatExperiment()
10-
11+
12+
def test_noise(self):
13+
# Test calling the noise method using a kwargs dictionary
14+
noise_method_noise_dna_uniform_masker = 'noise_dna_uniform_masker'
15+
kwarg_dict = {'probability': 0.5}
16+
original_data = {
17+
"sequences1:dna:input": ["ACGTACGT", "ACGTACGT", "ACGTACGT", "ACGTACGT"],
18+
"sequences2:dna:input": ["ACGTACGT", "ACGTACGT", "ACGTACGT", "ACGTACGT"],
19+
"float1:float:label": [1.0, 2.0, 3.0, 4.0],
20+
"float2:float:label": [1.0, 2.0, 3.0, 4.0]
21+
}
22+
23+
data = deepcopy(original_data)
24+
noisy_data = self.dna_to_float_experiment.noise(data, noise_method_noise_dna_uniform_masker, **kwarg_dict)
25+
self.assertIsInstance(noisy_data, dict)
26+
self.assertEqual(len(noisy_data), 4)
27+
# checking if the noise was applied to the correct keys, meaning that the sequences have changed
28+
self.assertNotEqual(noisy_data["sequences1:dna:input"], original_data["sequences1:dna:input"])
29+
self.assertNotEqual(noisy_data["sequences2:dna:input"], original_data["sequences2:dna:input"])
30+
# checking if the noise was not applied to the float keys
31+
self.assertEqual(noisy_data["float1:float:label"], original_data["float1:float:label"])
32+
self.assertEqual(noisy_data["float2:float:label"], original_data["float2:float:label"])
33+

bin/tests/test_noise_generators.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,15 +26,15 @@ def test_add_noise(self):
2626
class TestUniformTextMasker(unittest.TestCase):
2727
def test_add_noise_single(self):
2828
# Test adding noise to a single string
29-
masker = UniformTextMasker(probability=0.1)
29+
masker = UniformTextMasker()
3030
noisy_data = masker.add_noise("ACGTACGT", seed=42)
3131
self.assertIsInstance(noisy_data, str) # making sure output is of correct type
3232
self.assertEqual(noisy_data, "ACGTACNT") # checking if given a seed the noise happens in the same way
3333

3434
def test_add_noise_multiprocess(self):
3535
# Test adding noise to a list of strings using multiprocessing
36-
masker = UniformTextMasker(probability=0.1)
37-
noisy_data_list = masker.add_noise_multiprocess(["ATCGATCGATCG", "ATCG"], seed=42)
36+
masker = UniformTextMasker()
37+
noisy_data_list = masker.add_noise_multiprocess(["ATCGATCGATCG", "ATCG"], seed=42, probability=0.1 )
3838
self.assertIsInstance(noisy_data_list, list) # making sure output is of correct type
3939
self.assertIsInstance(noisy_data_list[0], str)
4040
self.assertIsInstance(noisy_data_list[1], str)
@@ -44,14 +44,14 @@ def test_add_noise_multiprocess(self):
4444
class TestGaussianNoise(unittest.TestCase):
4545
def test_add_noise_single(self):
4646
# Test adding noise to a single float value
47-
noise_generator = GaussianNoise(mean=0, std=1)
48-
noisy_data = noise_generator.add_noise(5.0, seed=42)
47+
noise_generator = GaussianNoise()
48+
noisy_data = noise_generator.add_noise(5.0, seed=42, mean=0, std=1)
4949
self.assertIsInstance(noisy_data, float)
5050
self.assertAlmostEqual(noisy_data, 5.4967141530) # there might be float point variation across systems so not all decimals have to be identical
5151

5252
def test_add_noise_multiprocess(self):
5353
# Test adding noise to a list of float values using multiprocessing
54-
noise_generator = GaussianNoise(mean=0, std=1)
54+
noise_generator = GaussianNoise()
5555
noisy_data = noise_generator.add_noise_multiprocess([1.0, 2.0, 3.0])
5656
self.assertIsInstance(noisy_data, list)
5757
self.assertIsInstance(noisy_data[0], float)

0 commit comments

Comments
 (0)