22
22
get_latest_mwf ,
23
23
get_file_set ,
24
24
get_library_from_file_set ,
25
+ get_library_preparation_from_library ,
25
26
get_sample_from_library ,
26
27
get_mwfr_file_input_arg ,
27
28
get_mwfr_parameter_input_arg ,
30
31
# MetaWorkflow names are used to get the latest version.
31
32
# We assume that they don't change!
32
33
MWF_NAME_ILLUMINA = "Illumina_alignment_GRCh38"
34
+ MWF_NAME_RNASEQ = "RNA-seq_bulk_short_reads_GRCh38"
33
35
MWF_NAME_ONT = "ONT_alignment_GRCh38"
34
36
MWF_NAME_PACBIO = "PacBio_alignment_GRCh38"
35
37
MWF_NAME_HIC = "Hi-C_alignment_GRCh38"
36
38
MWF_NAME_FASTQC = "Illumina_FASTQ_quality_metrics"
37
39
MWF_NAME_FASTQ_LONG_READ = "long_reads_FASTQ_quality_metrics"
38
40
MWF_NAME_FASTQ_SHORT_READ = "short_reads_FASTQ_quality_metrics"
39
41
MWF_NAME_CRAM_TO_FASTQ_PAIRED_END = "cram_to_fastq_paired-end"
42
+ MWF_NAME_BAM_TO_FASTQ_PAIRED_END = "bam_to_fastq_paired-end"
40
43
MWF_NAME_BAMQC_SHORT_READ = "paired-end_short_reads_BAM_quality_metrics_GRCh38"
41
44
MWF_NAME_ULTRA_LONG_BAMQC = "ultra-long_reads_BAM_quality_metrics_GRCh38"
42
45
MWF_NAME_LONG_READ_BAMQC = "long_reads_BAM_quality_metrics_GRCh38"
52
55
SAMPLE_NAME = "sample_name"
53
56
LENGTH_REQUIRED = "length_required"
54
57
LIBRARY_ID = "library_id"
58
+ GENOME_REFERENCE_STAR = "genome_reference_star"
59
+ IS_STRANDED = "is_stranded"
60
+ STRANDEDNESS = "strandedness"
55
61
56
62
# Schema fields
57
63
COMMON_FIELDS = "common_fields"
64
70
ACCESSION = "accession"
65
71
ALIASES = "aliases"
66
72
UPLOADED = "uploaded"
73
+ FIRST_STRANDED = "First Stranded"
74
+ SECOND_STRANDED = "Second Stranded"
67
75
68
76
69
77
################################################
@@ -88,6 +96,56 @@ def mwfr_illumina_alignment(fileset_accession, length_required, smaht_key):
88
96
)
89
97
90
98
99
def mwfr_rnaseq_alignment(fileset_accession, sequence_length, smaht_key):
    """Creates a MetaWorkflowRun item in the portal for RNA-Seq alignment of submitted files within a fileset.

    Args:
        fileset_accession: Accession of the file set; passed to ``get_file_set``.
        sequence_length: Presumably the read length of the library (confirm
            with callers). It is coerced with ``int()`` so that a numeric
            string such as ``"151"`` is accepted; ``sequence_length - 1`` is
            embedded in the alias used to look up the STAR index.
        smaht_key: Portal credentials, forwarded to every portal call made here.

    Raises:
        ValueError, TypeError: If ``sequence_length`` cannot be converted to int.
        Exception: If the alias search does not return exactly one reference file.
        KeyError: If the library preparation item has no ``"strand"`` entry.
    """
    mwf = get_latest_mwf(MWF_NAME_RNASEQ, smaht_key)
    print(f"Using MetaWorkflow {mwf[ACCESSION]} ({mwf[ALIASES][0]})")

    file_set = get_file_set(fileset_accession, smaht_key)
    mwfr_input = get_core_alignment_mwfr_input_from_readpairs(
        file_set, INPUT_FILES_R1_FASTQ_GZ, INPUT_FILES_R2_FASTQ_GZ, smaht_key
    )

    # RNA-Seq specific input: the STAR index is selected by an "oh<N>" alias
    # component where N = sequence_length - 1 (presumably STAR's sjdbOverhang
    # -- confirm against the reference file naming scheme).
    overhang = int(sequence_length) - 1
    genome_reference_star_alias = (
        "smaht:ReferenceFile-star-index-no-alt-no-hla-gencode45-"
        f"oh{overhang}_GCA_000001405.15_GRCh38_no_decoy"
    )
    search_reference_file = f"?type=File&aliases={genome_reference_star_alias}"
    reference_files = ff_utils.search_metadata(
        f"/search/{search_reference_file}", key=smaht_key
    )
    # Zero hits means no index exists for this overhang; more than one means
    # the alias is ambiguous. Either way we cannot pick a reference safely.
    if len(reference_files) != 1:
        raise Exception(
            f"Did not find exactly one genome_reference_star reference file. Search was: {search_reference_file}"
        )
    genome_reference_star_file = reference_files[0]

    mwfr_input.append(
        get_mwfr_file_input_arg(
            GENOME_REFERENCE_STAR,
            [{"file": genome_reference_star_file[UUID]}],
        )
    )

    # Get strandedness info
    library = get_library_from_file_set(file_set, smaht_key)
    library_preparation = get_library_preparation_from_library(library, smaht_key)
    strand = library_preparation["strand"]

    # Only stranded libraries get the extra parameters; for any other strand
    # value nothing is added here.
    strandedness_mapping = {FIRST_STRANDED: "rf", SECOND_STRANDED: "fr"}
    if strand in strandedness_mapping:
        mwfr_input.extend(
            [
                get_mwfr_parameter_input_arg(IS_STRANDED, "true"),
                get_mwfr_parameter_input_arg(
                    STRANDEDNESS, strandedness_mapping[strand]
                ),
            ]
        )

    create_and_post_mwfr(
        mwf[UUID], file_set, INPUT_FILES_R1_FASTQ_GZ, mwfr_input, smaht_key
    )
147
+
148
+
91
149
def mwfr_pacbio_alignment (fileset_accession , smaht_key ):
92
150
"""Creates a MetaWorflowRun item in the portal for PacBio alignment of submitted files within a fileset"""
93
151
@@ -196,19 +254,51 @@ def mwfr_cram_to_fastq_paired_end(fileset_accession, smaht_key):
196
254
get_mwfr_file_input_arg (INPUT_FILES_CRAM , crams ),
197
255
get_mwfr_file_input_arg (GENOME_REFERENCE_FASTA , reference_genome ),
198
256
]
199
- # pprint.pprint(mwfr_input)
200
257
create_and_post_mwfr (mwf [UUID ], file_set , INPUT_FILES_CRAM , mwfr_input , smaht_key )
201
258
202
259
260
def mwfr_bam_to_fastq_paired_end(fileset_accession, smaht_key):
    """Creates a MetaWorkflowRun item for the ``bam_to_fastq_paired-end`` MetaWorkflow.

    Searches the portal for submitted BAM files attached to the file set and
    passes them, one per input dimension, to ``create_and_post_mwfr``. If the
    search returns nothing, a message is printed and no run is created.

    Args:
        fileset_accession: Accession of the file set; passed to ``get_file_set``.
        smaht_key: Portal credentials, forwarded to every portal call made here.
    """
    file_set = get_file_set(fileset_accession, smaht_key)
    mwf = get_latest_mwf(MWF_NAME_BAM_TO_FASTQ_PAIRED_END, smaht_key)
    print(f"Using MetaWorkflow {mwf[ACCESSION]} ({mwf[ALIASES][0]})")

    # Get submitted BAMs in fileset (can be aligned or unaligned)
    search_filter = (
        f"?file_sets.uuid={file_set[UUID]}"
        "&type=SubmittedFile"
        "&file_format.display_title=bam"
    )
    files_to_run = ff_utils.search_metadata(f"search/{search_filter}", key=smaht_key)

    if not files_to_run:
        print(f"No files to run for search {search_filter}")
        return

    # Dimensions are assigned over the reversed search result (presumably to
    # restore submission order -- confirm against the portal's default sort).
    files_to_run.reverse()
    bams = [
        {"file": submitted_file[UUID], "dimension": f"{dim}"}
        for dim, submitted_file in enumerate(files_to_run)
    ]

    mwfr_input = [
        get_mwfr_file_input_arg(INPUT_FILES_BAM, bams),
    ]
    create_and_post_mwfr(mwf[UUID], file_set, INPUT_FILES_BAM, mwfr_input, smaht_key)
286
+
287
+
203
288
################################################
204
289
# QC MetaWorkflowRuns
205
290
################################################
206
291
207
292
208
- def mwfr_fastqc (fileset_accession , smaht_key ):
293
+ def mwfr_fastqc (fileset_accession , check_lanes , smaht_key ):
209
294
210
295
file_set = get_file_set (fileset_accession , smaht_key )
211
- mwf = get_latest_mwf (MWF_NAME_FASTQC , smaht_key )
296
+ if check_lanes :
297
+ print (f"Using MetaWorkflow { MWF_NAME_FASTQC } " )
298
+ mwf = get_latest_mwf (MWF_NAME_FASTQC , smaht_key )
299
+ else :
300
+ print (f"Using MetaWorkflow { MWF_NAME_FASTQ_SHORT_READ } " )
301
+ mwf = get_latest_mwf (MWF_NAME_FASTQ_SHORT_READ , smaht_key )
212
302
print (f"Using MetaWorkflow { mwf [ACCESSION ]} ({ mwf [ALIASES ][0 ]} )" )
213
303
214
304
# Get unaligned R2 reads in the fileset that don't already have QC
@@ -245,14 +335,19 @@ def mwfr_fastqc(fileset_accession, smaht_key):
245
335
246
336
files_input_list = sorted (files_input , key = lambda x : x ["dimension" ])
247
337
338
+ # If we are not running the Illumina MWF, reformat to 1D list
339
+ if not check_lanes :
340
+ for i , inp in enumerate (files_input_list ):
341
+ inp ["dimension" ] = f"{ i } "
342
+
248
343
mwfr_input = [get_mwfr_file_input_arg (INPUT_FILES_FASTQ_GZ , files_input_list )]
249
344
create_and_post_mwfr (
250
345
mwf [UUID ], file_set , INPUT_FILES_FASTQ_GZ , mwfr_input , smaht_key
251
346
)
252
347
253
348
254
349
def mwfr_ubam_qc_long_read (fileset_accession , smaht_key ):
255
-
350
+
256
351
file_set = get_file_set (fileset_accession , smaht_key )
257
352
mwf = get_latest_mwf (MWF_NAME_FASTQ_LONG_READ , smaht_key )
258
353
print (f"Using MetaWorkflow { mwf [ACCESSION ]} ({ mwf [ALIASES ][0 ]} )" )
@@ -391,9 +486,7 @@ def get_core_alignment_mwfr_input(file_set, file_input_arg, smaht_key):
391
486
sample = get_sample_from_library (library , smaht_key )
392
487
393
488
search_filter = (
394
- "?type=UnalignedReads"
395
- f"&file_sets.uuid={ file_set [UUID ]} "
396
- f"&status={ UPLOADED } "
489
+ "?type=UnalignedReads" f"&file_sets.uuid={ file_set [UUID ]} " f"&status={ UPLOADED } "
397
490
)
398
491
search_result = ff_utils .search_metadata (f"/search/{ search_filter } " , key = smaht_key )
399
492
search_result .reverse ()
0 commit comments