
Commit 788cbd7

Merge pull request #27 from mathysgrapotte/pipeline_integration
Pipeline integration
2 parents 984b3f0 + a61484b commit 788cbd7

16 files changed (+602, -1 lines)

.gitignore

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ bin/**/__pycache__/
 /notebook/.ipynb_checkpoints
 .ipynb_checkpoints
 .nextflow*
-works
+/work/
 .DS_Store
 /singularity_cache
 /results/
Dockerfile

Lines changed: 11 additions & 0 deletions

# python 3.11.8-slim-bullseye
FROM python@sha256:a2d01031695ff170831430810ee30dd06d8413b08f72ad978b43fd10daa6b86e
LABEL maintainer="Alessio Vignoli" \
      name="alessiovignoli3/stimulus:torch_scikit_numpy" \
      description="Docker image containing python packages required for model-check pipeline"

# install the required python packages
RUN pip install \
    numpy==1.26.0 \
    pytorch-lightning==2.0.1 \
    scikit-learn==1.3.0
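
Because the base image is pinned by digest and the packages by exact version, the environment can be sanity-checked from inside the container. A minimal sketch (hypothetical script name; the assertions only restate the pins above):

# check_env.py - hypothetical sanity check, run inside the container
import numpy
import pytorch_lightning
import sklearn

# versions restate the pins from the RUN pip install above
assert numpy.__version__ == "1.26.0"
assert pytorch_lightning.__version__ == "2.0.1"
assert sklearn.__version__ == "1.3.0"
print("all pinned versions present")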

bin/json_schema.py

Lines changed: 58 additions & 0 deletions
from abc import ABC
from typing import Literal


class JsonSchema(ABC):
    """
    This class helps decode and work on a defined Json schema used by the stimulus pipeline.
    """
    def __init__(self, schema: dict) -> None:
        self.schema = schema
        self.noise_arg = schema.get('noise', [])
        self.split_arg = schema.get('split', [])
        self.custom_arg = schema.get('custom', [])

        # check that both noise and split have a coherent number of parameter values
        self.number_noise_val = self._check_params_schema('noise')
        self.number_split_val = self._check_params_schema('split')

    def _check_params_schema(self, switch: Literal['noise', 'split']) -> int | None:
        """
        Helper that checks whether the number of values under params is consistent among all params.
        Given {"NoiserName": {"params": [{"val1": [0, 1]}]}, "OtherNoiser": {"params": [{"val1": [2, 3], "val3": [4]}]}}
        it raises an error because val3 has a list of len() 1 instead of 2;
        otherwise it returns that len().
        """
        starting_list = self.noise_arg
        if switch == 'split':
            starting_list = self.split_arg

        # in case there is no noise or split flag but a custom one instead
        if not starting_list and self.custom_arg:
            return None

        num_params_list = []
        # iterate through the given list because more than one noising function could be specified
        for dictionary in starting_list:

            # take into account that there could be the keyword default
            if dictionary["params"] == "default":
                # TODO think about what to do in this case
                continue

            # iterate through the possibly multiple parameters
            for params_flag, params_list in dictionary["params"][0].items():
                num_params_list.append(len(params_list))

        # nothing to compare when every entry used the default keyword
        if not num_params_list:
            return None

        # check that all parameter value counts found are equal
        if len(set(num_params_list)) == 1:
            return num_params_list[0]
        else:
            raise ValueError(f"Expected the same number of values for all the params under the {switch} flag, but received a discordant amount in the input Json.")
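
For illustration, a hedged usage sketch of JsonSchema (the noiser names and params below are made up, but follow the layout the docstring describes):

# hypothetical schemas exercising the consistency check
from json_schema import JsonSchema

ok = {"noise": [{"name": "NoiserA", "params": [{"val1": [0, 1]}]},
                {"name": "NoiserB", "params": [{"val1": [2, 3], "val2": [4, 5]}]}]}
print(JsonSchema(ok).number_noise_val)    # 2: every param carries two values

bad = {"noise": [{"name": "NoiserA", "params": [{"val1": [0, 1]}]},
                 {"name": "NoiserB", "params": [{"val1": [2, 3], "val3": [4]}]}]}
JsonSchema(bad)                           # raises ValueError: discordant amount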

bin/launch_csv_handling.py

Lines changed: 38 additions & 0 deletions
#!/usr/bin/env python3

import argparse
from src.data.csv_parser import CSVParser
import src.data.experiments as exp


def get_args():
    """Get the arguments when used from the command line."""
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("-c", "--csv", type=str, required=True, metavar="FILE", help='The file path for the csv containing all data')
    parser.add_argument("-j", "--json", type=str, required=True, metavar="FILE", help='The json config file that holds all the parameter info')

    args = parser.parse_args()
    return args


def main(data_csv, config_json):

    print(data_csv, config_json)

    # intended flow, still to be implemented:
    """
    experiment = exp.eval(json[experiment_name])
    data = CsvHandler(data_csv, experiment)
    data.noise(json[params])
    data.split(json[split_parameters])
    """


if __name__ == "__main__":
    args = get_args()
    main(args.csv, args.json)

bin/launch_interpret_json.py

Lines changed: 53 additions & 0 deletions
#!/usr/bin/env python3

import argparse
import json
from json_schema import JsonSchema


def get_args():
    """Get the arguments when used from the command line."""
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("-j", "--json", type=str, required=True, metavar="FILE", help='The json config file that holds all the parameter info')

    args = parser.parse_args()
    return args


def interpret_json(input_json: dict) -> None:

    # TODO handle the case of no noise or splitter

    # initialize the json schema
    schema = JsonSchema(input_json)

    #print("\nnoise_configurations :\n", schema.noise_arg, "\n", type(schema.noise_arg))
    #print("\nsplit_configurations :\n", schema.split_arg, "\n", type(schema.split_arg))
    #print("\ncustom_configurations :\n", schema.custom_arg, "\n", type(schema.custom_arg))
    print(schema.number_noise_val, schema.number_split_val)


def main(config_json: str) -> None:

    # open and read the Json
    config = {}
    with open(config_json, 'r') as in_json:
        config = json.load(in_json)

    # interpret the json through the schema class
    interpret_json(config)


if __name__ == "__main__":
    args = get_args()
    main(args.json)

configs/crg.config

Lines changed: 42 additions & 0 deletions
params {
    config_profile_name        = 'CRG profile'
    config_profile_description = 'Configuration to run on CRG cluster'

    max_cpus   = 64
    max_memory = 100.GB
    max_time   = 48.h
}


process {
    executor      = 'crg'
    maxRetries    = params.max_retries
    errorStrategy = params.err_start

    withLabel:process_low {
        queue  = 'cn-el7,short-centos79'
        cpus   = { check_max( 1, 'cpus' ) }
        memory = { check_max( 4.GB * task.attempt, 'memory' ) }
        time   = { check_max( 1.h * task.attempt, 'time' ) }
    }
    withLabel:process_medium {
        queue  = 'cn-el7,short-centos79'
        cpus   = { check_max( 4, 'cpus' ) }
        memory = { check_max( 10.GB * task.attempt, 'memory' ) }
        time   = { check_max( 6.h * task.attempt, 'time' ) }
    }
    withLabel:process_medium_high {
        queue  = 'cn-el7'
        cpus   = { check_max( 12, 'cpus' ) }
        memory = { check_max( 50.GB * task.attempt, 'memory' ) }
        time   = { check_max( 12.h * task.attempt, 'time' ) }
    }
}


singularity {
    enabled  = true
    cacheDir = 'singularity_cache'
}
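
Note that check_max, params.max_retries and params.err_start are referenced here but not defined in any file of this commit; in nf-core-style pipelines check_max typically caps each request at the params.max_* ceilings. A minimal Python sketch of that capping idea, purely illustrative (the real function is Groovy and also handles memory and time units):

# illustrative only: mirrors the capping behaviour for the 'cpus' case
def check_max(value: int, max_cpus: int = 64) -> int:
    """Cap a requested cpu count at the configured ceiling (params.max_cpus)."""
    return min(value, max_cpus)

print(check_max(12))    # 12, under the 64-cpu ceiling
print(check_max(128))   # 64, capped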

configs/local.config

Lines changed: 36 additions & 0 deletions
params {
    config_profile_name        = 'Local profile'
    config_profile_description = 'Configuration to run on local machine'
}


process {
    maxRetries    = params.max_retries
    errorStrategy = params.err_start

    withLabel:process_low {
        cpus   = { check_max( 1, 'cpus' ) }
        memory = { check_max( 4.GB * task.attempt, 'memory' ) }
        time   = { check_max( 1.h * task.attempt, 'time' ) }
    }
    withLabel:process_medium {
        cpus   = { check_max( 4, 'cpus' ) }
        memory = { check_max( 10.GB * task.attempt, 'memory' ) }
        time   = { check_max( 6.h * task.attempt, 'time' ) }
    }
    withLabel:process_medium_high {
        cpus   = { check_max( 12, 'cpus' ) }
        memory = { check_max( 50.GB * task.attempt, 'memory' ) }
        time   = { check_max( 12.h * task.attempt, 'time' ) }
    }
}


docker {
    enabled  = true
    cacheDir = 'docker_cache'

    // prototype fix for a warning about Ray tuner using /tmp instead of /dev/shm
    //runOptions = '--shm-size=1.84gb'
}

configs/modules.config

Lines changed: 34 additions & 0 deletions
/* config file for defining DSL2 per module options and publishing paths
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    Available keys to override module options:
        ext.args   = Additional arguments appended to command in module.
        ext.args2  = Second set of arguments appended to command in module (multi-tool modules).
        ext.args3  = Third set of arguments appended to command in module (multi-tool modules).
        ext.prefix = File name prefix for output files.
----------------------------------------------------------------------------------------
*/

process {

    withName: "GENERATE_FASTA|GENERATE_FROM_FASTA" {
        ext.args = { [ params.dna_seq_len   ? "-sl ${params.dna_seq_len}"  : '-sl 100',
                       params.motif_tag     ? "-t ${params.motif_tag}"     : '-t 5',
                       params.non_motif_tag ? "-u ${params.non_motif_tag}" : '-u 0',
                       params.num_seq       ? "-ns ${params.num_seq}"      : '',
                       params.motif_start   ? "-p ${params.motif_start}"   : ''
                     ].flatten().unique(false).join(' ').trim()
        }
        ext.prefix = { params.generated_fasta ? params.generated_fasta : null }

        // the outdir has to be the one the user specifies, plus a suffix that makes each run unique
        publishDir = [
            path: { "${params.outdir}/${workflow.runName}_" + "${workflow.start}".replaceAll('[-:]', '_').split('\\.')[0] },
            mode: params.publish_dir_mode,
            overwrite: true
        ]
    }
}
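
The ext.args closure assembles the FASTA generator's command-line flags from params, with fallbacks -sl 100, -t 5 and -u 0, and drops -ns/-p entirely when unset. The same assembly logic re-expressed as a Python sketch, for illustration only (the pipeline evaluates the Groovy closure above, not this):

# illustrative re-expression of the ext.args closure
def build_args(params: dict) -> str:
    parts = [
        f"-sl {params['dna_seq_len']}" if params.get('dna_seq_len') else "-sl 100",
        f"-t {params['motif_tag']}" if params.get('motif_tag') else "-t 5",
        f"-u {params['non_motif_tag']}" if params.get('non_motif_tag') else "-u 0",
        f"-ns {params['num_seq']}" if params.get('num_seq') else "",
        f"-p {params['motif_start']}" if params.get('motif_start') else "",
    ]
    return " ".join(p for p in parts if p).strip()

print(build_args({}))                  # -sl 100 -t 5 -u 0
print(build_args({"num_seq": 500}))    # -sl 100 -t 5 -u 0 -ns 500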

configs/test.config

Lines changed: 24 additions & 0 deletions
params {
    config_profile_name        = 'CRG profile'
    config_profile_description = 'Configuration to run on CRG cluster'

    max_cpus   = 64
    max_memory = 100.GB
    max_time   = 48.h
}


process {
    executor  = 'crg'
    queue     = 'cn-el7'
    cpus      = 1
    queueSize = 50
    memory    = '6.GB'
    time      = '1 h'
}


singularity {
    enabled  = true
    cacheDir = 'singularity_cache'
}

examples/pipeline_generated.json

Lines changed: 26 additions & 0 deletions
{
    "experiment": "DnaToFloatExperiment",
    "noise": [
        {
            "column_name": "input1",
            "name": "UniformTextMasker",
            "params": [{"probability": [0.1]}]
        },
        {
            "column_name": "input2",
            "name": "UniformTextMasker",
            "params": [{"probability": [0.4]}]
        },
        {
            "column_name": "label",
            "name": "GaussianNoise",
            "params": [{"mean": [0.5], "std": [0.1]}]
        }
    ],
    "split": [
        {
            "name": "RandomSplitter",
            "params": [{"split": [[0.6, 0.8]]}]
        }
    ]
}
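
This example is exactly the shape launch_interpret_json.py consumes. A hedged walk-through, assuming json_schema.py is importable and the file sits at examples/pipeline_generated.json relative to the working directory:

import json
from json_schema import JsonSchema

with open("examples/pipeline_generated.json") as in_json:
    config = json.load(in_json)

# every param above carries exactly one value, so both counts are 1
schema = JsonSchema(config)
print(schema.number_noise_val, schema.number_split_val)   # 1 1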
