Skip to content

Commit 0037fe0

Browse files
Add RNA-Seq MWFR
1 parent 083cbe9 commit 0037fe0

File tree

7 files changed

+193
-18
lines changed

7 files changed

+193
-18
lines changed

CHANGELOG.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,13 @@
33
Change Log
44
==========
55

6+
3.7.0
7+
=====
8+
* Add support for BAM2FASTQ MetaWorkflowRun
9+
* Add RNA-Seq alignment MetaWorkflowRun
10+
11+
12+
613
3.6.2
714
=====
815
* Add short_reads_FASTQ_quality_metrics MetaWorkflowRun

magma_smaht/commands/create_meta_workflow_run.py

Lines changed: 57 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,18 @@
22

33
from magma_smaht.create_metawfr import (
44
mwfr_illumina_alignment,
5+
mwfr_rnaseq_alignment,
56
mwfr_pacbio_alignment,
67
mwfr_fastqc,
78
mwfr_hic_alignment,
89
mwfr_ont_alignment,
910
mwfr_cram_to_fastq_paired_end,
11+
mwfr_bam_to_fastq_paired_end,
1012
mwfr_bamqc_short_read,
1113
mwfr_ubam_qc_long_read,
1214
mwfr_ultra_long_bamqc,
1315
mwfr_long_read_bamqc,
14-
mwfr_short_read_fastqc
16+
mwfr_short_read_fastqc,
1517
)
1618
from magma_smaht.utils import get_auth_key
1719

@@ -48,6 +50,31 @@ def align_illumina(fileset_accession, length_required, auth_env):
4850
mwfr_illumina_alignment(fileset_accession, length_required, smaht_key)
4951

5052

53+
@cli.command()
54+
@click.help_option("--help", "-h")
55+
@click.option(
56+
"-f", "--fileset-accession", required=True, type=str, help="Fileset accession"
57+
)
58+
@click.option(
59+
"-l",
60+
"--sequence-length",
61+
required=True,
62+
type=int,
63+
help="Sequence length (can be obtained from FastQC output)",
64+
)
65+
@click.option(
66+
"-e",
67+
"--auth-env",
68+
required=True,
69+
type=str,
70+
help="Name of environment in smaht-keys file",
71+
)
72+
def align_rnaseq(fileset_accession, sequence_length, auth_env):
73+
"""Alignment MWFR for RNA-Seq data"""
74+
smaht_key = get_auth_key(auth_env)
75+
mwfr_rnaseq_alignment(fileset_accession, sequence_length, smaht_key)
76+
77+
5178
@cli.command()
5279
@click.help_option("--help", "-h")
5380
@click.option(
@@ -107,24 +134,30 @@ def align_ont(fileset_accession, auth_env):
107134
@click.option(
108135
"-f", "--fileset-accession", required=True, type=str, help="Fileset accession"
109136
)
137+
@click.option(
138+
"-c",
139+
"--check-lanes",
140+
required=True,
141+
default=True,
142+
type=bool,
143+
help="Whether to check lanes or not (different MWFs)",
144+
)
110145
@click.option(
111146
"-e",
112147
"--auth-env",
113148
required=True,
114149
type=str,
115150
help="Name of environment in smaht-keys file",
116151
)
117-
def qc_short_read_fastq_illumina(fileset_accession, auth_env):
152+
def qc_short_read_fastq_illumina(fileset_accession, check_lanes, auth_env):
118153
"""QC MWFR for paired short-read Illumina FASTQs"""
119154
smaht_key = get_auth_key(auth_env)
120-
mwfr_fastqc(fileset_accession, smaht_key)
155+
mwfr_fastqc(fileset_accession, check_lanes, smaht_key)
121156

122157

123158
@cli.command()
124159
@click.help_option("--help", "-h")
125-
@click.option(
126-
"-f", "--file-accession", required=True, type=str, help="File accession"
127-
)
160+
@click.option("-f", "--file-accession", required=True, type=str, help="File accession")
128161
@click.option(
129162
"-e",
130163
"--auth-env",
@@ -222,5 +255,23 @@ def conversion_cram_to_fastq(fileset_accession, auth_env):
222255
mwfr_cram_to_fastq_paired_end(fileset_accession, smaht_key)
223256

224257

258+
@cli.command()
259+
@click.help_option("--help", "-h")
260+
@click.option(
261+
"-f", "--fileset-accession", required=True, type=str, help="Fileset accession"
262+
)
263+
@click.option(
264+
"-e",
265+
"--auth-env",
266+
required=True,
267+
type=str,
268+
help="Name of environment in smaht-keys file",
269+
)
270+
def conversion_bam_to_fastq(fileset_accession, auth_env):
271+
"""Conversion MWFR for BAM to FASTQ (paired-end)"""
272+
smaht_key = get_auth_key(auth_env)
273+
mwfr_bam_to_fastq_paired_end(fileset_accession, smaht_key)
274+
275+
225276
if __name__ == "__main__":
226277
cli()

magma_smaht/commands/wrangler_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ def cli():
2626
type=str,
2727
help="Name of environment in smaht-keys file",
2828
)
29-
def cram2fastq_out_to_fileset(mwfr_identifier, auth_env):
30-
"""Associate CRAM2FASTQ output with fileset"""
29+
def associate_conversion_output_with_fileset(mwfr_identifier, auth_env):
30+
"""Associate CRAM2FASTQ or BAM2FASTQ output with fileset"""
3131
smaht_key = get_auth_key(auth_env)
3232
wrangler_utils.associate_conversion_output_with_fileset(mwfr_identifier, smaht_key)
3333

magma_smaht/create_metawfr.py

Lines changed: 100 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
get_latest_mwf,
2323
get_file_set,
2424
get_library_from_file_set,
25+
get_library_preparation_from_library,
2526
get_sample_from_library,
2627
get_mwfr_file_input_arg,
2728
get_mwfr_parameter_input_arg,
@@ -30,13 +31,15 @@
3031
# MetaWorkflow names are used to get the latest version.
3132
# We assume that they don't change!
3233
MWF_NAME_ILLUMINA = "Illumina_alignment_GRCh38"
34+
MWF_NAME_RNASEQ = "RNA-seq_bulk_short_reads_GRCh38"
3335
MWF_NAME_ONT = "ONT_alignment_GRCh38"
3436
MWF_NAME_PACBIO = "PacBio_alignment_GRCh38"
3537
MWF_NAME_HIC = "Hi-C_alignment_GRCh38"
3638
MWF_NAME_FASTQC = "Illumina_FASTQ_quality_metrics"
3739
MWF_NAME_FASTQ_LONG_READ = "long_reads_FASTQ_quality_metrics"
3840
MWF_NAME_FASTQ_SHORT_READ = "short_reads_FASTQ_quality_metrics"
3941
MWF_NAME_CRAM_TO_FASTQ_PAIRED_END = "cram_to_fastq_paired-end"
42+
MWF_NAME_BAM_TO_FASTQ_PAIRED_END = "bam_to_fastq_paired-end"
4043
MWF_NAME_BAMQC_SHORT_READ = "paired-end_short_reads_BAM_quality_metrics_GRCh38"
4144
MWF_NAME_ULTRA_LONG_BAMQC = "ultra-long_reads_BAM_quality_metrics_GRCh38"
4245
MWF_NAME_LONG_READ_BAMQC = "long_reads_BAM_quality_metrics_GRCh38"
@@ -52,6 +55,9 @@
5255
SAMPLE_NAME = "sample_name"
5356
LENGTH_REQUIRED = "length_required"
5457
LIBRARY_ID = "library_id"
58+
GENOME_REFERENCE_STAR = "genome_reference_star"
59+
IS_STRANDED = "is_stranded"
60+
STRANDEDNESS = "strandedness"
5561

5662
# Schema fields
5763
COMMON_FIELDS = "common_fields"
@@ -64,6 +70,8 @@
6470
ACCESSION = "accession"
6571
ALIASES = "aliases"
6672
UPLOADED = "uploaded"
73+
FIRST_STRANDED = "First Stranded"
74+
SECOND_STRANDED = "Second Stranded"
6775

6876

6977
################################################
@@ -88,6 +96,56 @@ def mwfr_illumina_alignment(fileset_accession, length_required, smaht_key):
8896
)
8997

9098

99+
def mwfr_rnaseq_alignment(fileset_accession, sequence_length, smaht_key):
100+
"""Creates a MetaWorkflowRun item in the portal for RNA-Seq alignment of submitted files within a fileset"""
101+
102+
mwf = get_latest_mwf(MWF_NAME_RNASEQ, smaht_key)
103+
print(f"Using MetaWorkflow {mwf[ACCESSION]} ({mwf[ALIASES][0]})")
104+
105+
file_set = get_file_set(fileset_accession, smaht_key)
106+
mwfr_input = get_core_alignment_mwfr_input_from_readpairs(
107+
file_set, INPUT_FILES_R1_FASTQ_GZ, INPUT_FILES_R2_FASTQ_GZ, smaht_key
108+
)
109+
# RNA-Seq specific input
110+
genome_reference_star_alias = f"smaht:ReferenceFile-star-index-no-alt-no-hla-gencode45-oh{sequence_length-1}_GCA_000001405.15_GRCh38_no_decoy"
111+
search_reference_file = "?type=File" f"&aliases={genome_reference_star_alias}"
112+
reference_files = ff_utils.search_metadata(
113+
f"/search/{search_reference_file}", key=smaht_key
114+
)
115+
if len(reference_files) != 1:
116+
raise Exception(
117+
f"Did not find exactly one genome_reference_star reference file. Search was: {search_reference_file}"
118+
)
119+
genome_reference_star_file = reference_files[0]
120+
121+
mwfr_input.append(
122+
get_mwfr_file_input_arg(
123+
GENOME_REFERENCE_STAR,
124+
[{"file": genome_reference_star_file[UUID]}],
125+
)
126+
)
127+
128+
# Get strandedness info
129+
library = get_library_from_file_set(file_set, smaht_key)
130+
library_preparation = get_library_preparation_from_library(library, smaht_key)
131+
strand = library_preparation["strand"]
132+
133+
if strand in [FIRST_STRANDED, SECOND_STRANDED]:
134+
strandedness_mapping = {FIRST_STRANDED: "rf", SECOND_STRANDED: "fr"}
135+
mwfr_input.extend(
136+
[
137+
get_mwfr_parameter_input_arg(IS_STRANDED, "true"),
138+
get_mwfr_parameter_input_arg(
139+
STRANDEDNESS, strandedness_mapping[strand]
140+
),
141+
]
142+
)
143+
144+
create_and_post_mwfr(
145+
mwf[UUID], file_set, INPUT_FILES_R1_FASTQ_GZ, mwfr_input, smaht_key
146+
)
147+
148+
91149
def mwfr_pacbio_alignment(fileset_accession, smaht_key):
92150
"""Creates a MetaWorkflowRun item in the portal for PacBio alignment of submitted files within a fileset"""
93151

@@ -196,19 +254,51 @@ def mwfr_cram_to_fastq_paired_end(fileset_accession, smaht_key):
196254
get_mwfr_file_input_arg(INPUT_FILES_CRAM, crams),
197255
get_mwfr_file_input_arg(GENOME_REFERENCE_FASTA, reference_genome),
198256
]
199-
# pprint.pprint(mwfr_input)
200257
create_and_post_mwfr(mwf[UUID], file_set, INPUT_FILES_CRAM, mwfr_input, smaht_key)
201258

202259

260+
def mwfr_bam_to_fastq_paired_end(fileset_accession, smaht_key):
261+
file_set = get_file_set(fileset_accession, smaht_key)
262+
mwf = get_latest_mwf(MWF_NAME_BAM_TO_FASTQ_PAIRED_END, smaht_key)
263+
print(f"Using MetaWorkflow {mwf[ACCESSION]} ({mwf[ALIASES][0]})")
264+
265+
# Get submitted BAMs in fileset (can be aligned or unaligned)
266+
search_filter = (
267+
f"?file_sets.uuid={file_set[UUID]}"
268+
"&type=SubmittedFile"
269+
"&file_format.display_title=bam"
270+
)
271+
files_to_run = ff_utils.search_metadata((f"search/{search_filter}"), key=smaht_key)
272+
files_to_run.reverse()
273+
274+
if len(files_to_run) == 0:
275+
print(f"No files to run for search {search_filter}")
276+
return
277+
278+
bams = []
279+
for dim, file in enumerate(files_to_run):
280+
bams.append({"file": file[UUID], "dimension": f"{dim}"})
281+
282+
mwfr_input = [
283+
get_mwfr_file_input_arg(INPUT_FILES_BAM, bams),
284+
]
285+
create_and_post_mwfr(mwf[UUID], file_set, INPUT_FILES_BAM, mwfr_input, smaht_key)
286+
287+
203288
################################################
204289
# QC MetaWorkflowRuns
205290
################################################
206291

207292

208-
def mwfr_fastqc(fileset_accession, smaht_key):
293+
def mwfr_fastqc(fileset_accession, check_lanes, smaht_key):
209294

210295
file_set = get_file_set(fileset_accession, smaht_key)
211-
mwf = get_latest_mwf(MWF_NAME_FASTQC, smaht_key)
296+
if check_lanes:
297+
print(f"Using MetaWorkflow {MWF_NAME_FASTQC}")
298+
mwf = get_latest_mwf(MWF_NAME_FASTQC, smaht_key)
299+
else:
300+
print(f"Using MetaWorkflow {MWF_NAME_FASTQ_SHORT_READ}")
301+
mwf = get_latest_mwf(MWF_NAME_FASTQ_SHORT_READ, smaht_key)
212302
print(f"Using MetaWorkflow {mwf[ACCESSION]} ({mwf[ALIASES][0]})")
213303

214304
# Get unaligned R2 reads in the fileset that don't already have QC
@@ -245,14 +335,19 @@ def mwfr_fastqc(fileset_accession, smaht_key):
245335

246336
files_input_list = sorted(files_input, key=lambda x: x["dimension"])
247337

338+
# If we are not running the Illumina MWF, reformat to 1D list
339+
if not check_lanes:
340+
for i, inp in enumerate(files_input_list):
341+
inp["dimension"] = f"{i}"
342+
248343
mwfr_input = [get_mwfr_file_input_arg(INPUT_FILES_FASTQ_GZ, files_input_list)]
249344
create_and_post_mwfr(
250345
mwf[UUID], file_set, INPUT_FILES_FASTQ_GZ, mwfr_input, smaht_key
251346
)
252347

253348

254349
def mwfr_ubam_qc_long_read(fileset_accession, smaht_key):
255-
350+
256351
file_set = get_file_set(fileset_accession, smaht_key)
257352
mwf = get_latest_mwf(MWF_NAME_FASTQ_LONG_READ, smaht_key)
258353
print(f"Using MetaWorkflow {mwf[ACCESSION]} ({mwf[ALIASES][0]})")
@@ -391,9 +486,7 @@ def get_core_alignment_mwfr_input(file_set, file_input_arg, smaht_key):
391486
sample = get_sample_from_library(library, smaht_key)
392487

393488
search_filter = (
394-
"?type=UnalignedReads"
395-
f"&file_sets.uuid={file_set[UUID]}"
396-
f"&status={UPLOADED}"
489+
"?type=UnalignedReads" f"&file_sets.uuid={file_set[UUID]}" f"&status={UPLOADED}"
397490
)
398491
search_result = ff_utils.search_metadata(f"/search/{search_filter}", key=smaht_key)
399492
search_result.reverse()

magma_smaht/utils.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,29 @@ def get_sample_from_library(library, smaht_key):
201201
return sample
202202

203203

204+
def get_library_preparation_from_library(library, smaht_key):
205+
"""Get the library preparation that is associated with a library
206+
207+
Args:
208+
library (dict): library item from portal
209+
smaht_key (dict): SMaHT key
210+
211+
Raises:
212+
Exception: Raises an exception when there is no library preparation
213+
214+
Returns:
215+
dict: library_preparation item from portal
216+
"""
217+
library_preparation = library.get("library_preparation")
218+
if not library_preparation:
219+
raise Exception(f"No library preparation found for library {library['accession']}")
220+
221+
library_preparation_item = ff_utils.get_metadata(
222+
library_preparation, add_on="frame=raw&datastore=database", key=smaht_key
223+
)
224+
return library_preparation_item
225+
226+
204227
def get_latest_mwf(mwf_name, smaht_key):
205228
"""Get the latest version of the MWF with name `mwf_name`
206229

magma_smaht/wrangler_utils.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,9 @@
1414
JsonObject = Dict[str, Any]
1515

1616
WF_CRAM_TO_FASTQ_PAIRED_END = "cram_to_fastq_paired-end"
17+
WF_BAM_TO_FASTQ_PAIRED_END = "bam_to_fastq_paired-end"
1718

18-
SUPPORTED_MWF = [MWF_NAME_CRAM_TO_FASTQ_PAIRED_END]
19+
SUPPORTED_MWF = [MWF_NAME_CRAM_TO_FASTQ_PAIRED_END, WF_BAM_TO_FASTQ_PAIRED_END]
1920

2021
# Portal constants
2122
COMPLETED = "completed"
@@ -48,7 +49,7 @@ def associate_conversion_output_with_fileset(
4849

4950
for wfr in mwfr_meta["workflow_runs"]:
5051
output = wfr["output"]
51-
if wfr["name"] != WF_CRAM_TO_FASTQ_PAIRED_END:
52+
if wfr["name"] not in [WF_CRAM_TO_FASTQ_PAIRED_END, WF_BAM_TO_FASTQ_PAIRED_END]:
5253
continue
5354
if len(output) != 2:
5455
print_error_and_exit(f"Expected exactly 2 output files in WorkflowRun")

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "magma-suite"
3-
version = "3.6.2"
3+
version = "3.7.0"
44
description = "Collection of tools to manage meta-workflows automation."
55
authors = ["Michele Berselli <[email protected]>", "Doug Rioux", "Soo Lee", "CGAP team"]
66
license = "MIT"

0 commit comments

Comments
 (0)