Skip to content

Commit 984b3f0

Browse files
Merge pull request #26 from mathysgrapotte/composition_data_types
2 parents fcc320e + 3429eff commit 984b3f0

File tree

10 files changed

+82
-296
lines changed

10 files changed

+82
-296
lines changed

bin/src/data/csv_parser.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,10 @@ def get_and_encode(self, dictionary: dict, idx: Any) -> dict:
9797

9898
# encode the data at given index
9999
# For that, it first retrieves the data object and then calls the encode_all method to encode the data
100-
output[name] = self.experiment.__getattribute__(data_type.lower()).encode_all(data)
100+
101+
102+
output[name] = self.experiment.get_encoding_all(data_type)(dictionary[key][idx])
103+
101104

102105
return output
103106

bin/src/data/data_types/data_types.py

Lines changed: 0 additions & 155 deletions
This file was deleted.

bin/src/data/data_types/encoding/encoders.py renamed to bin/src/data/encoding/encoders.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,3 +89,33 @@ def decode(self, data: np.array) -> str:
8989
"""
9090
return self.encoder.inverse_transform(data)
9191

92+
class FloatEncoder(AbstractEncoder):
    """
    Encoder for float data.

    Values are converted with the built-in ``float``; no scaling or other
    transformation is applied.
    """

    def encode(self, data: float) -> float:
        """
        Encodes a single data point as a float.

        Raises whatever ``float()`` raises (ValueError / TypeError) when the
        value cannot be converted.
        """
        return float(data)

    def encode_all(self, data: list) -> list:
        """
        Encodes one or several data points and returns them as a list of floats.

        An iterable (other than a string) is encoded element by element.
        A string or a non-iterable scalar is treated as a single data point
        and returned as a one-element list.
        """
        # a string is iterable, but iterating it would convert it character by
        # character, so it is handled as one value
        if isinstance(data, str):
            return [self.encode(data)]

        try:
            values = iter(data)
        except TypeError:
            # not iterable: a bare scalar such as a single float or int
            return [self.encode(data)]

        return [self.encode(value) for value in values]

    def decode(self, data: float) -> float:
        """
        Decodes the data.

        Encoding only casts to float, so the value is returned unchanged.
        """
        return data
121+

bin/src/data/experiments.py

Lines changed: 13 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,11 @@
99

1010
from abc import ABC, abstractmethod
1111
from typing import Any
12-
from .data_types import data_types as data_types
1312
from .spliters import spliters as spliters
13+
from .encoding import encoders as encoders
14+
from .noise import noise_generators as noise_generators
1415
from copy import deepcopy
16+
1517
import numpy as np
1618

1719
class AbstractExperiment(ABC):
@@ -24,8 +26,6 @@ def __init__(self, seed: float = None) -> None:
2426
# allow ability to add a seed for reproducibility
2527
self.seed = seed
2628

27-
#self.random_splitter = spliters.RandomSplitter(seed=seed)
28-
2929

3030
def get_split_indexes(self, data: list, split: tuple) -> list | list | list:
3131
"""
@@ -51,48 +51,26 @@ def get_keys_based_on_name_data_type_or_input(self, data: dict, column_name: str
5151

5252
if column_name is not None:
5353
return [key for key in data if column_name in key.split(':')[0]]
54+
5455
if data_type is not None:
5556
if category is not None:
5657
return [key for key in data if data_type in key.split(':')[1] and category in key.split(':')[2]]
5758
else:
5859
return [key for key in data if data_type in key.split(':')[1]]
59-
60-
61-
def noise(self, data: Any, noise_method: str, **noise_params: dict) -> Any:
60+
61+
def get_encoding_all(self, data_type: str) -> Any:
6262
"""
63-
Adds noise to the data, using function defined in self.noise
63+
This method gets the encoding function for a specific data type.
6464
"""
65-
# check if noise_method exist in the class, if it does, call it with the associated **noise_params, if not raise an error
66-
67-
if hasattr(self, noise_method):
68-
return getattr(self, noise_method)(data, **noise_params)
69-
else:
70-
raise NotImplementedError(f"No noise method {noise_method} in the class {self.__class__.__name__}")
71-
65+
return getattr(self, data_type)['encoder'].encode_all
7266

7367
class DnaToFloatExperiment(AbstractExperiment):
7468
"""
7569
Class for dealing with DNA to float predictions (for instance regression from DNA sequence to CAGE value)
7670
"""
77-
78-
def __init__(self, seed: float = None, **parameters) -> None:
79-
super().__init__(seed)
80-
self.dna = data_types.Dna(**parameters)
81-
self.float = data_types.Float(**parameters)
82-
83-
def noise_dna_uniform_masker(self, data: dict, **noise_params) -> dict:
84-
"""
85-
Adds noise to the data of a single input.
86-
Applied on all input keys that have the dna data type.
87-
"""
88-
89-
90-
dna_type_keys = self.get_keys_based_on_name_data_type_or_input(data, data_type='dna')
91-
92-
for key in dna_type_keys:
93-
data[key] = self.dna.add_noise_uniform_text_masker_all_inputs(data[key], seed=self.seed, **noise_params)
94-
95-
return data
96-
97-
71+
def __init__(self):
72+
super().__init__()
73+
self.dna = {'encoder': encoders.TextOneHotEncoder(alphabet='acgt'), 'noise_generators': {'uniform_text_masker': noise_generators.UniformTextMasker()}}
74+
self.float = {'encoder': encoders.FloatEncoder(), 'noise_generators': {'uniform_float_masker': noise_generators.GaussianNoise()}}
75+
#self.protein = {'encoder': encoders.TextOneHotEncoder(), 'noise_generators': {'uniform_text_masker': noise_generators.UniformTextMasker()}}
9876

bin/src/data/handlertorch.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,16 @@ def convert_list_of_numpy_arrays_to_tensor(self, data: list) -> Tuple[torch.Tens
2121
"""
2222
Converts a list of numpy arrays to a tensor.
2323
If the list includes numpy arrays of different shapes, pad the numpy arrays and return a mask tensor; otherwise the mask tensor is set to None.
24+
25+
# TODO: This method uses if-statements to check the shape of the data, which is not ideal. Performance could be improved here.
2426
"""
2527
if len(data) > 1:
28+
# check if data is a flat list (of float or integers):
29+
if isinstance(data[0], (float, int)):
30+
return torch.tensor(data), None
31+
2632
# check if the data is of different shapes
27-
if len(set([d.shape for d in data])) == 1:
33+
elif len(set([d.shape for d in data])) == 1:
2834
return torch.tensor(np.array(data)), None
2935

3036
# otherwise, pad the data and build a mask tensor that points to where the data has been padded.

bin/src/data/data_types/noise/noise_generators.py renamed to bin/src/data/noise/noise_generators.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,13 @@ def add_noise(self, data: Any, seed: float = None) -> Any:
2525
# np.random.seed(seed)
2626
raise NotImplementedError
2727

28-
def add_noise_multiprocess(self, data: list, seed: float = None) -> list:
28+
@abstractmethod
29+
def add_noise_all(self, data: list, seed: float = None) -> list:
2930
"""
30-
Adds noise to the data using multiprocessing.
31+
Adds noise to the data.
3132
"""
32-
with mp.Pool(mp.cpu_count()) as pool:
33-
# reshaping the inputs of this function to meet starmap requirements, basically adding into a tuple the list[elem] + seed
34-
function_specific_input = [(item, seed) for item in data]
35-
return pool.starmap(self.add_noise, function_specific_input)
33+
# np.random.seed(seed)
34+
raise NotImplementedError
3635

3736

3837
class UniformTextMasker(AbstractNoiseGenerator):
@@ -47,13 +46,16 @@ def add_noise(self, data: str, probability: float = 0.1, mask='N', seed: float =
4746
np.random.seed(seed)
4847
return ''.join([c if np.random.rand() > probability else mask for c in data])
4948

50-
def add_noise_multiprocess(self, data: list, probability: float = 0.1, mask='N', seed: float = None) -> list:
49+
50+
def add_noise_all(self, data: list, probability: float = 0.1, mask='N', seed: float = None) -> list:
51+
5152
"""
5253
Adds noise to the data using multiprocessing.
5354
"""
5455
with mp.Pool(mp.cpu_count()) as pool:
5556
function_specific_input = [(item, probability, mask, seed) for item in data]
5657
return pool.starmap(self.add_noise, function_specific_input)
58+
5759

5860
class GaussianNoise(AbstractNoiseGenerator):
5961
"""
@@ -67,10 +69,11 @@ def add_noise(self, data: float, mean: float = 0, std: float= 0, seed: float = N
6769
np.random.seed(seed)
6870
return data + np.random.normal(mean, std)
6971

70-
def add_noise_multiprocess(self, data: list, mean: float = 0, std: float = 0, seed: float = None) -> list:
72+
def add_noise_all(self, data: list, mean: float = 0, std: float = 0, seed: float = None) -> list:
7173
"""
7274
Adds noise to the data using np arrays
7375
# TODO return a np array to gain performance.
7476
"""
7577
np.random.seed(seed)
76-
return list(np.array(data) + np.random.normal(mean, std, len(data)))
78+
return list(np.array(data) + np.random.normal(mean, std, len(data)))
79+

0 commit comments

Comments
 (0)