Add add_split method inside CsvProcessing and corresponding tests #37

Merged · 3 commits · Mar 18, 2024
5 changes: 5 additions & 0 deletions bin/launch_csv_handling.py
@@ -20,6 +20,11 @@ def get_args():


def main(data_csv, config_json):
"""
This launcher is the connection between the csv data and one json configuration.
It should also handle some sanity checks.
TODO: add a check that raises an error when split is defined neither in the json config nor in the csv data.
"""

print(data_csv, config_json)

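A possible sketch of that TODO check (the helper name and signature are illustrative, not part of this PR; it assumes the parsed json config is a dict and that csv columns follow the name:category:type convention used in this codebase):

def check_split_defined(config: dict, csv_columns: list) -> None:
    # Raise when neither the json config nor the csv data defines a split.
    csv_has_split = any(col.split(':')[1] == 'split' for col in csv_columns)
    if 'split' not in config and not csv_has_split:
        raise ValueError("No split found: define a 'split' entry in the json config or provide a split column in the csv data.")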
34 changes: 29 additions & 5 deletions bin/src/data/csv.py
@@ -11,6 +11,7 @@
The parser is a class that takes as input a CSV file and an experiment class that defines the data types to be used, noising procedures, splitting, etc.
"""

import numpy as np
import polars as pl
from typing import Any, Tuple, Union
from functools import partial
@@ -74,15 +75,38 @@ class CsvProcessing(CsvHandler):
"""
Class to load the input csv data and add noise accordingly.
"""

def __init__(self, experiment: Any, csv_path: str) -> None:
super().__init__(experiment, csv_path)
self.data = self.load_csv()


def add_split(self, split_method: str, split: list, seed: float = None, force: bool = False) -> None:
Contributor: Name doesn't feel too intuitive for me.

"""
Add a column specifying the train, validation, test splits of the data.
An error is raised if the split column is already present in the csv file. This behaviour can be overridden by setting force=True.

args:
split_method (str) : The method to split the data, should be one of the keys of the split dictionary in the experiment class.
split (list) : The proportions for [train, validation, test] splits.
seed (float) : The seed for reproducibility.
force (bool) : If True, the split column will be added even if it is already present in the csv file.
"""
if ('split' in self.categories) and (not force):
raise ValueError("The category split is already present in the csv file. If you still want to use this function, set force=True.")

Contributor: We discussed that a split column already present in the csv should raise a warning, not an error. I will merge as is, but this should be modified in the future.

# get the indices for train, validation and test using the specified split method
train, validation, test = self.experiment.get_function_split(split_method)(len(self.data), split, seed)

# add the split column to the data
split_column = np.full(len(self.data), np.nan)
split_column[train] = 0
split_column[validation] = 1
split_column[test] = 2
self.data = self.data.with_columns(pl.Series('split:split:int', split_column))

def add_noise(self, configs: list) -> None:
"""
Adds noise to the data.
Noise is added for each column with the configurations specified in the configs list.
Noise is added for each column with the specified configurations.
"""
# for each column configuration
for dictionary in configs:
@@ -96,11 +120,11 @@ def add_noise(self, configs: list) -> None:
# change the column with the new values
self.data = self.data.with_columns(pl.Series(key, new_column))

def save(self, path: str) -> None:
def save(self, data: pl.DataFrame, path: str) -> None:
"""
Saves the data to a csv file.
"""
self.data.write_csv(path)
data.write_csv(path)


class CsvLoader(CsvHandler):
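A minimal usage sketch of the new add_split method (the csv path and split proportions are illustrative; note that save now takes the dataframe to write explicitly):

from bin.src.data.csv import CsvProcessing
from bin.src.data.experiments import DnaToFloatExperiment

csv_processing = CsvProcessing(DnaToFloatExperiment(), 'path/to/data.csv')
csv_processing.add_split(split_method='RandomSplitter', split=[0.6, 0.2, 0.2], seed=0)
csv_processing.save(csv_processing.data, 'path/to/data_with_split.csv')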
14 changes: 7 additions & 7 deletions bin/src/data/experiments.py
@@ -23,13 +23,6 @@ class AbstractExperiment(ABC):
def __init__(self, seed: float = None) -> None:
# allow ability to add a seed for reproducibility
self.seed = seed


def get_split_indexes(self, data: list, split: tuple) -> list | list | list:
"""
Returns the indexes of the split data.
"""
raise NotImplementedError

def get_function_encode_all(self, data_type: str) -> Any:
"""
@@ -42,6 +35,12 @@ def get_function_noise_all(self, data_type: str, noise_generator: str) -> Any:
This method adds noise to all the entries.
"""
return getattr(self, data_type)['noise_generators'][noise_generator].add_noise_all

def get_function_split(self, split_method: str) -> Any:
"""
This method returns the function for splitting the data.
"""
return self.split[split_method].get_split_indexes


class DnaToFloatExperiment(AbstractExperiment):
@@ -52,6 +51,7 @@ def __init__(self) -> None:
super().__init__()
self.dna = {'encoder': encoders.TextOneHotEncoder(alphabet='acgt'), 'noise_generators': {'UniformTextMasker': noise_generators.UniformTextMasker(mask='N')}}
self.float = {'encoder': encoders.FloatEncoder(), 'noise_generators': {'GaussianNoise': noise_generators.GaussianNoise()}}
self.split = {'RandomSplitter': spliters.RandomSplitter()}


class ProtDnaToFloatExperiment(DnaToFloatExperiment):
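For context, the new get_function_split dispatch can be exercised roughly like this (a sketch under the assumptions of this diff, not code from the PR):

experiment = DnaToFloatExperiment()
split_fn = experiment.get_function_split('RandomSplitter')  # bound RandomSplitter().get_split_indexes
train, validation, test = split_fn(100, [0.6, 0.2, 0.2], 0)  # (length_of_data, split, seed)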
75 changes: 47 additions & 28 deletions bin/src/data/spliters/spliters.py
@@ -11,48 +11,67 @@ class AbstractSplitter(ABC):
"""
Abstract class for splitters.
"""
def __init__(self, seed: float = 0) -> None:
# allow ability to add a seed for reproducibility
if seed != 0:
np.random.seed(seed)

@abstractmethod
def split(self, data: list) -> list:
def get_split_indexes(self, length_of_data: int, split: list, seed: float = None) -> tuple[list, list, list]:
"""
Splits the data. Always returns lists of indices mapping into the original data.
"""
raise NotImplementedError

@abstractmethod
def distance(self, data_one: Any, data_two: Any) -> float:
"""
Calculates the distance between two elements of the data.
"""
raise NotImplementedError
# @abstractmethod
# def distance(self, data_one: Any, data_two: Any) -> float:
# """
# Calculates the distance between two elements of the data.
# """
# raise NotImplementedError


class RandomSplitter(AbstractSplitter):
"""
This splitter randomly splits the data.
"""

def __init__(self, seed: float = 0) -> None:
super().__init__(seed=seed)
def __init__(self) -> None:
super().__init__()

def split(self, length_of_data: int, split: tuple) -> list | list | list:
def get_split_indexes(self, length_of_data: int, split: list, seed: float = None) -> tuple[list, list, list]:
"""
Randomly splits the data in three lists according to the split tuple, the split tuple should contain two values between 0 and 1 in an ascending manner.
Instead of returning the original data, returns three lists of indexes mapping to the indexes in the original data.
Splits the data indices into train, validation, and test sets.
One can use these lists of indices to subset the data afterwards.

args:
length_of_data: int
The length of the data.
split: list
The proportions for [train, validation, test] splits.
seed: float
The seed for reproducibility.
returns:
train: list
The indices for the training set.
validation: list
The indices for the validation set.
test: list
The indices for the test set.
"""
if split[0] >= split[1]:
raise ValueError("The split tuple should contain two values between 0 and 1 in an ascending manner.")
train, test, validation = [], [], []
for i in range(length_of_data):
r = np.random.rand()
if r < split[0]:
train.append(i)
elif r < split[1]:
test.append(i)
else:
validation.append(i)
return train, test, validation
if len(split) != 3:
raise ValueError("The split argument should be a list with length 3 that contains the proportions for [train, validation, test] splits.")
if not np.isclose(sum(split), 1.0):  # tolerate float rounding, e.g. 0.7 + 0.2 + 0.1
raise ValueError("The sum of the split proportions should be 1.")

# Generate a list of indices and shuffle it
indices = np.arange(length_of_data)
np.random.seed(seed)
np.random.shuffle(indices)

# Calculate the sizes of the train, validation, and test sets
train_size = int(split[0] * length_of_data)
validation_size = int(split[1] * length_of_data)

# Split the shuffled indices according to the calculated sizes
train = indices[:train_size].tolist()
validation = indices[train_size:train_size+validation_size].tolist()
test = indices[train_size+validation_size:].tolist()

return train, validation, test
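A quick worked check of the size arithmetic above, with illustrative values: for length_of_data=10 and split=[0.6, 0.2, 0.2], train_size = int(0.6 * 10) = 6 and validation_size = int(0.2 * 10) = 2, so the test set gets the remaining 2 indices (rounding remainders always fall into the test set):

splitter = RandomSplitter()
train, validation, test = splitter.get_split_indexes(10, [0.6, 0.2, 0.2], seed=0)
assert len(train) == 6 and len(validation) == 2 and len(test) == 2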
48 changes: 33 additions & 15 deletions bin/tests/test_csv.py
@@ -1,6 +1,8 @@
import json
import os
import unittest
import sys
sys.path.append('./')
from bin.src.data.csv import CsvProcessing, CsvLoader
from bin.src.data.experiments import DnaToFloatExperiment,ProtDnaToFloatExperiment
from bin.src.data.experiments import DnaToFloatExperiment, ProtDnaToFloatExperiment
@@ -9,12 +11,27 @@ class AbstractTestCsvProcessing(unittest.TestCase):
"""
Abstract class for testing CsvProcessing class.
"""
def _test_data_shape(self):
def _test_len(self):
"""
It tests that the data is loaded correctly, with the correct length.
"""
self.assertEqual(self.csv_processing.data.shape[0], self.data_shape[0])
self.assertEqual(self.csv_processing.data.shape[1], self.data_shape[1])
self.assertEqual(len(self.csv_processing.data), self.data_length)

def _add_split(self):
config = self.configs['split']
self.csv_processing.add_split(
split_method=config['name'],
split=config['params']['split'],
seed=config['params']['seed']
)

def _test_random_splitter(self, expected_splits):
"""
It tests that the data is split correctly.
"""
for i in range(self.data_length):
self.assertEqual(self.csv_processing.data['split:split:int'][i], expected_splits[i])

def _add_noise(self):
self.csv_processing.add_noise(self.configs['noise'])
@@ -27,9 +44,6 @@ def _test_value_from_column(self, column_name, expected_value, position=0):
if isinstance(observed_value, float):
observed_value = round(observed_value, 2)
self.assertEqual(observed_value, expected_value)

def _test_split(self):
pass


class TestDnaToFloatCsvProcessing(AbstractTestCsvProcessing):
@@ -42,14 +56,16 @@ def setUp(self):
self.csv_processing = CsvProcessing(self.experiment, self.csv_path)
with open('bin/tests/test_data/dna_experiment/test_config.json', 'rb') as f:
self.configs = json.load(f)
self.data_length = 2

def test_data_shape(self):
self.data_shape = [2,3]
self._test_data_shape()

def test_add_noise(self):
def test_len(self):
self._test_len()

def test_split_and_noise(self):
self._test_value_from_column('hello:input:dna', 'ACTGACTGATCGATGC')
self._test_value_from_column('hola:label:float', 12)
self._add_split()
self._test_random_splitter([1, 0])
self._add_noise()
self._test_value_from_column('hello:input:dna', 'ACTGACTGATCGATNN')
self._test_value_from_column('hola:label:float', 12.68)
@@ -66,15 +82,17 @@ def setUp(self):
self.csv_processing = CsvProcessing(self.experiment, self.csv_path)
with open('bin/tests/test_data/prot_dna_experiment/test_config.json', 'rb') as f:
self.configs = json.load(f)
self.data_length = 2

def test_load_csv(self):
self.data_shape = [2,4]
self._test_data_shape()
def test_len(self):
self._test_len()

def test_add_noise(self):
def test_split_and_noise(self):
self._test_value_from_column('bonjour:input:prot', 'GPRTTIKAKQLETLK')
self._test_value_from_column('hello:input:dna', 'ACTGACTGATCGATGC')
self._test_value_from_column('hola:label:float', 12)
self._add_split()
self._test_random_splitter([1, 0])
self._add_noise()
self._test_value_from_column('bonjour:input:prot', 'GPRTTIKAKQLETLX')
self._test_value_from_column('hello:input:dna', 'ACTGACTGATCGATNN')
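Given the sys.path.append('./') added above, these tests are presumably meant to be run from the repository root, e.g. with python -m unittest bin/tests/test_csv.py (an assumption about the intended workflow; the runner invocation is not part of this diff).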
8 changes: 3 additions & 5 deletions bin/tests/test_data/dna_experiment/test_config.json
@@ -12,10 +12,8 @@
"params": {"mean": 0.5, "std": 0.1, "seed": 0}
}
],
"split": [
{
"split": {
"name": "RandomSplitter",
"params": {"split": [0.6, 0.8], "seed": 0}
}
]
"params": {"split": [0.5, 0.5, 0.0], "seed": 0}
}
}
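Note the shape change here: the split entry moves from a one-element list to a plain object, and its params change from cumulative boundaries ([0.6, 0.8], as consumed by the old threshold-based splitter) to explicit [train, validation, test] proportions ([0.5, 0.5, 0.0]) that must sum to 1, matching the new RandomSplitter signature.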
8 changes: 3 additions & 5 deletions bin/tests/test_data/prot_dna_experiment/test_config.json
@@ -17,10 +17,8 @@
"params": {"mean": 0.5, "std": 0.1, "seed": 0}
}
],
"split": [
{
"split": {
"name": "RandomSplitter",
"params": {"split": [0.6, 0.8], "seed": 0}
}
]
"params": {"split": [0.5, 0.5, 0], "seed": 0}
}
}
10 changes: 8 additions & 2 deletions examples/pipeline_generated.json
@@ -2,12 +2,16 @@
"experiment": "DnaToFloatExperiment",
"noise": [
{
"column_name": "inphello:input1:dnaut1",

"column_name": "hello:input:dna",

"name": "UniformTextMasker",
"params": {"probability": 0.1}
},
{
"column_name": "hello:input2:prot",

"column_name": "bonjour:input:prot",

"name": "UniformTextMasker",
"params": {"probability": 0.4}
},
@@ -17,6 +21,7 @@
"params": {"mean": 0.5, "std": 0.1}
}
],

"split":
{
"name": "RandomSplitter",
@@ -26,4 +31,5 @@


}

}