From 5a32dbbb4eacb83b37738ad62a7448693a13b4c9 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 5 Feb 2025 15:50:55 -0500 Subject: [PATCH 1/7] First pass --- v03_pipeline/lib/misc/sv.py | 49 +++++++++++++++++++ v03_pipeline/lib/misc/sv_test.py | 11 +++++ v03_pipeline/lib/model/dataset_type.py | 4 ++ v03_pipeline/lib/model/definitions.py | 3 ++ .../write_remapped_and_subsetted_callset.py | 9 ++++ 5 files changed, 76 insertions(+) create mode 100644 v03_pipeline/lib/misc/sv.py create mode 100644 v03_pipeline/lib/misc/sv_test.py diff --git a/v03_pipeline/lib/misc/sv.py b/v03_pipeline/lib/misc/sv.py new file mode 100644 index 000000000..3ab709871 --- /dev/null +++ b/v03_pipeline/lib/misc/sv.py @@ -0,0 +1,49 @@ +import hail as hl + +from v03_pipeline.lib.annotations import sv +from v03_pipeline.lib.model import ReferenceGenome, Sex +from v03_pipeline.lib.pedigree import Family + + +def overwrite_male_non_par_calls( + mt: hl.MatrixTable, + families: set[Family], +) -> hl.MatrixTable: + male_sample_ids = { + s.sample_id for f in families for s in f.samples if s.sex == Sex.MALE + } + male_sample_ids = hl.set(male_sample_ids) or hl.empty_set(hl.str) + par_intervals = hl.array( + [ + i + for i in hl.get_reference(ReferenceGenome.GRCh38).par + if i.start.contig == ReferenceGenome.GRCh38.x_contig + ], + ) + # NB: making use of existing formatting_annotation_fns. + # We choose to annotate & drop here as the sample level + # fields are dropped by the time we format variants. + mt = mt.annotate_rows( + start_locus=sv.start_locus(mt), + end_locus=sv.end_locus(mt), + ) + mt = mt.annotate_entries( + GT=hl.if_else( + ( + male_sample_ids.contains(mt.s) + & hl.any( + par_intervals.map( + lambda i: i.overlaps( + hl.interval( + mt.start_locus, + mt.end_locus, + ), + ), + ), + ) + ), + hl.Call([1, 1], phased=False), + mt.GT, + ), + ) + return mt.drop('start_locus', 'end_locus') diff --git a/v03_pipeline/lib/misc/sv_test.py b/v03_pipeline/lib/misc/sv_test.py new file mode 100644 index 000000000..54c122144 --- /dev/null +++ b/v03_pipeline/lib/misc/sv_test.py @@ -0,0 +1,11 @@ +import unittest + +from v03_pipeline.lib.misc.sv import overwrite_male_non_par_calls +from v03_pipeline.lib.pedigree import Family, Sample + +TEST_SV_VCF = 'v03_pipeline/var/test/callsets/sv_1.vcf' + + +class SVTest(unittest.TestCase): + def test_overwrite_male_non_par_calls(self) -> None: + pass diff --git a/v03_pipeline/lib/model/dataset_type.py b/v03_pipeline/lib/model/dataset_type.py index 357d73c7b..55def58e7 100644 --- a/v03_pipeline/lib/model/dataset_type.py +++ b/v03_pipeline/lib/model/dataset_type.py @@ -387,3 +387,7 @@ def export_vcf_annotation_fns(self) -> list[Callable[..., hl.Expression]]: sv.info, ], }[self] + + @property + def overwrite_male_non_par_calls(self) -> None: + return self == DatasetType.SV diff --git a/v03_pipeline/lib/model/definitions.py b/v03_pipeline/lib/model/definitions.py index f0a8989c4..badc6e442 100644 --- a/v03_pipeline/lib/model/definitions.py +++ b/v03_pipeline/lib/model/definitions.py @@ -70,6 +70,9 @@ def mito_contig(self) -> str: ReferenceGenome.GRCh38: 'chrM', }[self] + def x_contig(self) -> str: + return 'X' if self == ReferenceGenome.GRCh37 else 'chrX' + def contig_recoding(self, include_mt: bool = False) -> dict[str, str]: recode = { ReferenceGenome.GRCh37: { diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py index d84af1769..15111f7ab 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py @@ -15,6 +15,7 @@ ) from v03_pipeline.lib.misc.pedigree import parse_pedigree_ht_to_families from v03_pipeline.lib.misc.sample_ids import remap_sample_ids, subset_samples +from v03_pipeline.lib.misc.sv import overwrite_male_non_par_calls from v03_pipeline.lib.misc.validation import SeqrValidationError from v03_pipeline.lib.model.feature_flag import FeatureFlag from v03_pipeline.lib.paths import ( @@ -174,6 +175,14 @@ def create_table(self) -> hl.MatrixTable: for field in mt.row_value: if field not in self.dataset_type.row_fields: mt = mt.drop(field) + if ( + FeatureFlag.overwrite_male_non_par_calls + and self.dataset_type.overwrite_male_non_par_calls + ): + overwrite_male_non_par_calls( + mt, + loadable_families, + ) return mt.select_globals( remap_pedigree_hash=remap_pedigree_hash( self.project_remap_paths[self.project_i], From d30d2f8b26439c764291db159779a4785f718dbb Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 5 Feb 2025 16:54:34 -0500 Subject: [PATCH 2/7] Finish test --- v03_pipeline/lib/misc/sv.py | 8 ++-- v03_pipeline/lib/misc/sv_test.py | 52 ++++++++++++++++++++++++- v03_pipeline/lib/model/definitions.py | 1 + v03_pipeline/var/test/callsets/sv_1.vcf | 4 +- 4 files changed, 59 insertions(+), 6 deletions(-) diff --git a/v03_pipeline/lib/misc/sv.py b/v03_pipeline/lib/misc/sv.py index 3ab709871..5c219e9a8 100644 --- a/v03_pipeline/lib/misc/sv.py +++ b/v03_pipeline/lib/misc/sv.py @@ -1,8 +1,8 @@ import hail as hl from v03_pipeline.lib.annotations import sv +from v03_pipeline.lib.misc.pedigree import Family from v03_pipeline.lib.model import ReferenceGenome, Sex -from v03_pipeline.lib.pedigree import Family def overwrite_male_non_par_calls( @@ -10,9 +10,11 @@ def overwrite_male_non_par_calls( families: set[Family], ) -> hl.MatrixTable: male_sample_ids = { - s.sample_id for f in families for s in f.samples if s.sex == Sex.MALE + s.sample_id for f in families for s in f.samples.values() if s.sex == Sex.MALE } - male_sample_ids = hl.set(male_sample_ids) or hl.empty_set(hl.str) + male_sample_ids = ( + hl.set(male_sample_ids) if male_sample_ids else hl.empty_set(hl.str) + ) par_intervals = hl.array( [ i diff --git a/v03_pipeline/lib/misc/sv_test.py b/v03_pipeline/lib/misc/sv_test.py index 54c122144..62c67250a 100644 --- a/v03_pipeline/lib/misc/sv_test.py +++ b/v03_pipeline/lib/misc/sv_test.py @@ -1,11 +1,59 @@ import unittest +import hail as hl + +from v03_pipeline.lib.misc.io import import_callset, select_relevant_fields +from v03_pipeline.lib.misc.pedigree import Family, Sample +from v03_pipeline.lib.misc.sample_ids import subset_samples from v03_pipeline.lib.misc.sv import overwrite_male_non_par_calls -from v03_pipeline.lib.pedigree import Family, Sample +from v03_pipeline.lib.model import DatasetType, ReferenceGenome, Sex TEST_SV_VCF = 'v03_pipeline/var/test/callsets/sv_1.vcf' class SVTest(unittest.TestCase): def test_overwrite_male_non_par_calls(self) -> None: - pass + mt = import_callset(TEST_SV_VCF, ReferenceGenome.GRCh38, DatasetType.SV) + mt = select_relevant_fields( + mt, + DatasetType.SV, + ) + mt = subset_samples( + mt, + hl.Table.parallelize( + [{'s': sample_id} for sample_id in ['RGP_164_1', 'RGP_164_2']], + hl.tstruct(s=hl.dtype('str')), + key='s', + ), + ) + mt = overwrite_male_non_par_calls( + mt, + { + Family( + family_guid='family_1', + samples={ + 'RGP_164_1': Sample(sample_id='RGP_164_1', sex=Sex.FEMALE), + 'RGP_164_2': Sample(sample_id='RGP_164_2', sex=Sex.MALE), + }, + ), + }, + ) + mt = mt.filter_rows(mt.locus.contig == 'chrX') + self.assertEqual( + [ + hl.Locus(contig='chrX', position=3, reference_genome='GRCh38'), + hl.Locus(contig='chrX', position=2781700, reference_genome='GRCh38'), + ], + mt.locus.collect(), + ) + self.assertEqual( + [ + hl.Call(alleles=[0, 0], phased=False), + # This call belongs to a MALE and intercepts + # the par region on the X chromosome. Therefore it is modified. + hl.Call(alleles=[1, 1], phased=False), + hl.Call(alleles=[0, 0], phased=False), + hl.Call(alleles=[0, 1], phased=False), + ], + mt.GT.collect(), + ) diff --git a/v03_pipeline/lib/model/definitions.py b/v03_pipeline/lib/model/definitions.py index badc6e442..171686e29 100644 --- a/v03_pipeline/lib/model/definitions.py +++ b/v03_pipeline/lib/model/definitions.py @@ -70,6 +70,7 @@ def mito_contig(self) -> str: ReferenceGenome.GRCh38: 'chrM', }[self] + @property def x_contig(self) -> str: return 'X' if self == ReferenceGenome.GRCh37 else 'chrX' diff --git a/v03_pipeline/var/test/callsets/sv_1.vcf b/v03_pipeline/var/test/callsets/sv_1.vcf index b38c26821..04f33ca85 100644 --- a/v03_pipeline/var/test/callsets/sv_1.vcf +++ b/v03_pipeline/var/test/callsets/sv_1.vcf @@ -85,4 +85,6 @@ chr1 16088760 CPX_chr1_41 N 684 PASS ALGORITHMS=manta;CHR2=chr1;CPX_INTERV chr1 17465707 INS_chr1_268 N 263 HIGH_SR_BACKGROUND ALGORITHMS=melt;CHR2=chr1;END=17465723;EVIDENCE=SR;PREDICTED_INTERGENIC;PREDICTED_NEAREST_TSS=RCC2;PREDICTED_NONCODING_BREAKPOINT=Tommerup_TADanno;SVLEN=955;SVTYPE=INS;AN=8;AC=1;AF=0.004466;N_BI_GENOS=2911;N_HOMREF=2885;N_HET=26;N_HOMALT=0;FREQ_HOMREF=0.991068;FREQ_HET=0.00893164;FREQ_HOMALT=0;MALE_AN=2894;MALE_AC=14;MALE_AF=0.004838;MALE_N_BI_GENOS=1447;MALE_N_HOMREF=1433;MALE_N_HET=14;MALE_N_HOMALT=0;MALE_FREQ_HOMREF=0.990325;MALE_FREQ_HET=0.00967519;MALE_FREQ_HOMALT=0;FEMALE_AN=2906;FEMALE_AC=11;FEMALE_AF=0.003785;FEMALE_N_BI_GENOS=1453;FEMALE_N_HOMREF=1442;FEMALE_N_HET=11;FEMALE_N_HOMALT=0;FEMALE_FREQ_HOMREF=0.992429;FEMALE_FREQ_HET=0.00757054;FEMALE_FREQ_HOMALT=0 GT:EV:GQ:PE_GQ:PE_GT:SR_GQ:SR_GT 0/0:PE,SR:99:99:0:99:0 0/0:PE,SR:99:99:0:99:0 0/1:SR:0:99:0:0:1 0/0:PE,SR:99:99:0:2:0 0/0:PE,SR:99:99:0:2:0 chr1 21427498 CPX_chr1_54 N 733 PASS ALGORITHMS=manta;CHR2=chr1;CPX_INTERVALS=DUP_chr1:21427498-21427959,INV_chr1:21427498-21480073,DEL_chr1:21480073-21480419;CPX_TYPE=dupINVdel;END=21480419;EVIDENCE=PE;PREDICTED_LOF=NBPF3;PREDICTED_NONCODING_BREAKPOINT=DNase,Tommerup_TADanno;PREDICTED_NONCODING_SPAN=DNase;SVLEN=52921;SVTYPE=CPX;AN=8;AC=4;AF=0.499656;N_BI_GENOS=2911;N_HOMREF=51;N_HET=2811;N_HOMALT=49;FREQ_HOMREF=0.0175198;FREQ_HET=0.965648;FREQ_HOMALT=0.0168327;MALE_AN=2894;MALE_AC=1453;MALE_AF=0.502073;MALE_N_BI_GENOS=1447;MALE_N_HOMREF=19;MALE_N_HET=1403;MALE_N_HOMALT=25;MALE_FREQ_HOMREF=0.0131306;MALE_FREQ_HET=0.969592;MALE_FREQ_HOMALT=0.0172771;FEMALE_AN=2906;FEMALE_AC=1445;FEMALE_AF=0.497247;FEMALE_N_BI_GENOS=1453;FEMALE_N_HOMREF=32;FEMALE_N_HET=1397;FEMALE_N_HOMALT=24;FEMALE_FREQ_HOMREF=0.0220234;FEMALE_FREQ_HET=0.961459;FEMALE_FREQ_HOMALT=0.0165175 GT:EV:GQ:PE_GQ:PE_GT:SR_GQ:SR_GT 0/1:PE:93:93:1:99:0 0/1:PE:79:79:1:99:0 0/1:PE:33:33:1:6:0 0/1:PE,SR:39:39:1:99:0 0/1:PE,SR:39:39:1:99:0 chr1 48963084 INS_chr1_688 N 526 HIGH_SR_BACKGROUND ALGORITHMS=melt;CHR2=chr1;END=48963135;EVIDENCE=SR;PREDICTED_INTRONIC=AGBL4;PREDICTED_NONCODING_BREAKPOINT=Tommerup_TADanno;SVLEN=5520;SVTYPE=INS;AN=8;AC=1;AF=0.06338;N_BI_GENOS=2911;N_HOMREF=2544;N_HET=365;N_HOMALT=2;FREQ_HOMREF=0.873926;FREQ_HET=0.125386;FREQ_HOMALT=0.000687049;MALE_AN=2894;MALE_AC=177;MALE_AF=0.061161;MALE_N_BI_GENOS=1447;MALE_N_HOMREF=1271;MALE_N_HET=175;MALE_N_HOMALT=1;MALE_FREQ_HOMREF=0.878369;MALE_FREQ_HET=0.12094;MALE_FREQ_HOMALT=0.000691085;FEMALE_AN=2906;FEMALE_AC=192;FEMALE_AF=0.06607;FEMALE_N_BI_GENOS=1453;FEMALE_N_HOMREF=1262;FEMALE_N_HET=190;FEMALE_N_HOMALT=1;FEMALE_FREQ_HOMREF=0.868548;FEMALE_FREQ_HET=0.130764;FEMALE_FREQ_HOMALT=0.000688231 GT:EV:GQ:PE_GQ:PE_GT:SR_GQ:SR_GT 0/0:PE,SR:99:99:0:99:0 0/0:PE,SR:99:99:0:99:0 0/0:PE,SR:99:99:0:99:0 0/1:SR:0:99:0:0:1 0/1:SR:0:99:0:0:1 -chr1 180540234 CPX_chr1_251 N 999 UNRESOLVED ALGORITHMS=manta;CHR2=chr1;CPX_INTERVALS=DEL_chr1:180540234-181074767,INV_chr1:181074767-181074938;CPX_TYPE=delINV;END=181074952;EVIDENCE=PE,SR;PREDICTED_LOF=KIAA1614,MR1,STX6,XPR1;PREDICTED_NONCODING_BREAKPOINT=Tommerup_TADanno;PREDICTED_NONCODING_SPAN=DNase,Enhancer;SVLEN=534718;SVTYPE=CPX;UNRESOLVED_TYPE=POSTHOC_RD_GT_REJECTION;AN=8;AC=3;AF=0.251804;N_BI_GENOS=2911;N_HOMREF=1559;N_HET=1238;N_HOMALT=114;FREQ_HOMREF=0.535555;FREQ_HET=0.425283;FREQ_HOMALT=0.0391618;MALE_AN=2894;MALE_AC=724;MALE_AF=0.250173;MALE_N_BI_GENOS=1447;MALE_N_HOMREF=784;MALE_N_HET=602;MALE_N_HOMALT=61;MALE_FREQ_HOMREF=0.541811;MALE_FREQ_HET=0.416033;MALE_FREQ_HOMALT=0.0421562;FEMALE_AN=2906;FEMALE_AC=736;FEMALE_AF=0.253269;FEMALE_N_BI_GENOS=1453;FEMALE_N_HOMREF=770;FEMALE_N_HET=630;FEMALE_N_HOMALT=53;FEMALE_FREQ_HOMREF=0.529938;FEMALE_FREQ_HET=0.433586;FEMALE_FREQ_HOMALT=0.0364763 GT:EV:GQ:PE_GQ:PE_GT:SR_GQ:SR_GT 0/0:PE,SR:99:99:0:99:0 0/1:PE,SR:41:26:1:41:1 1/1:PE,SR:89:33:1:89:2 0/0:PE,SR:99:99:0:99:0 0/0:PE,SR:99:99:0:99:0 \ No newline at end of file +chr1 180540234 CPX_chr1_251 N 999 UNRESOLVED ALGORITHMS=manta;CHR2=chr1;CPX_INTERVALS=DEL_chr1:180540234-181074767,INV_chr1:181074767-181074938;CPX_TYPE=delINV;END=181074952;EVIDENCE=PE,SR;PREDICTED_LOF=KIAA1614,MR1,STX6,XPR1;PREDICTED_NONCODING_BREAKPOINT=Tommerup_TADanno;PREDICTED_NONCODING_SPAN=DNase,Enhancer;SVLEN=534718;SVTYPE=CPX;UNRESOLVED_TYPE=POSTHOC_RD_GT_REJECTION;AN=8;AC=3;AF=0.251804;N_BI_GENOS=2911;N_HOMREF=1559;N_HET=1238;N_HOMALT=114;FREQ_HOMREF=0.535555;FREQ_HET=0.425283;FREQ_HOMALT=0.0391618;MALE_AN=2894;MALE_AC=724;MALE_AF=0.250173;MALE_N_BI_GENOS=1447;MALE_N_HOMREF=784;MALE_N_HET=602;MALE_N_HOMALT=61;MALE_FREQ_HOMREF=0.541811;MALE_FREQ_HET=0.416033;MALE_FREQ_HOMALT=0.0421562;FEMALE_AN=2906;FEMALE_AC=736;FEMALE_AF=0.253269;FEMALE_N_BI_GENOS=1453;FEMALE_N_HOMREF=770;FEMALE_N_HET=630;FEMALE_N_HOMALT=53;FEMALE_FREQ_HOMREF=0.529938;FEMALE_FREQ_HET=0.433586;FEMALE_FREQ_HOMALT=0.0364763 GT:EV:GQ:PE_GQ:PE_GT:SR_GQ:SR_GT 0/0:PE,SR:99:99:0:99:0 0/1:PE,SR:41:26:1:41:1 1/1:PE,SR:89:33:1:89:2 0/0:PE,SR:99:99:0:99:0 0/0:PE,SR:99:99:0:99:0 +chrX 3 CPX_chrX_251 N 999 UNRESOLVED ALGORITHMS=manta;CPX_INTERVALS=DEL_chr1:180540234-181074767,INV_chr1:181074767-181074938;CPX_TYPE=delINV;END=2781000;EVIDENCE=PE,SR;PREDICTED_LOF=KIAA1614,MR1,STX6,XPR1;PREDICTED_NONCODING_BREAKPOINT=Tommerup_TADanno;PREDICTED_NONCODING_SPAN=DNase,Enhancer;SVLEN=534718;SVTYPE=CPX;UNRESOLVED_TYPE=POSTHOC_RD_GT_REJECTION;AN=8;AC=3;AF=0.251804;N_BI_GENOS=2911;N_HOMREF=1559;N_HET=1238;N_HOMALT=114;FREQ_HOMREF=0.535555;FREQ_HET=0.425283;FREQ_HOMALT=0.0391618;MALE_AN=2894;MALE_AC=724;MALE_AF=0.250173;MALE_N_BI_GENOS=1447;MALE_N_HOMREF=784;MALE_N_HET=602;MALE_N_HOMALT=61;MALE_FREQ_HOMREF=0.541811;MALE_FREQ_HET=0.416033;MALE_FREQ_HOMALT=0.0421562;FEMALE_AN=2906;FEMALE_AC=736;FEMALE_AF=0.253269;FEMALE_N_BI_GENOS=1453;FEMALE_N_HOMREF=770;FEMALE_N_HET=630;FEMALE_N_HOMALT=53;FEMALE_FREQ_HOMREF=0.529938;FEMALE_FREQ_HET=0.433586;FEMALE_FREQ_HOMALT=0.0364763 GT:EV:GQ:PE_GQ:PE_GT:SR_GQ:SR_GT 0/0:PE,SR:99:99:0:99:0 0/1:PE,SR:41:26:1:41:1 1/1:PE,SR:89:33:1:89:2 0/0:PE,SR:99:99:0:99:0 0/0:PE,SR:99:99:0:99:0 +chrX 2781700 CPX_chrX_252 N 999 UNRESOLVED ALGORITHMS=manta;CPX_INTERVALS=DEL_chr1:180540234-181074767,INV_chr1:181074767-181074938;CPX_TYPE=delINV;END=2781900;EVIDENCE=PE,SR;PREDICTED_LOF=KIAA1614,MR1,STX6,XPR1;PREDICTED_NONCODING_BREAKPOINT=Tommerup_TADanno;PREDICTED_NONCODING_SPAN=DNase,Enhancer;SVLEN=534718;SVTYPE=CPX;UNRESOLVED_TYPE=POSTHOC_RD_GT_REJECTION;AN=8;AC=3;AF=0.251804;N_BI_GENOS=2911;N_HOMREF=1559;N_HET=1238;N_HOMALT=114;FREQ_HOMREF=0.535555;FREQ_HET=0.425283;FREQ_HOMALT=0.0391618;MALE_AN=2894;MALE_AC=724;MALE_AF=0.250173;MALE_N_BI_GENOS=1447;MALE_N_HOMREF=784;MALE_N_HET=602;MALE_N_HOMALT=61;MALE_FREQ_HOMREF=0.541811;MALE_FREQ_HET=0.416033;MALE_FREQ_HOMALT=0.0421562;FEMALE_AN=2906;FEMALE_AC=736;FEMALE_AF=0.253269;FEMALE_N_BI_GENOS=1453;FEMALE_N_HOMREF=770;FEMALE_N_HET=630;FEMALE_N_HOMALT=53;FEMALE_FREQ_HOMREF=0.529938;FEMALE_FREQ_HET=0.433586;FEMALE_FREQ_HOMALT=0.0364763 GT:EV:GQ:PE_GQ:PE_GT:SR_GQ:SR_GT 0/0:PE,SR:99:99:0:99:0 0/1:PE,SR:41:26:1:41:1 1/1:PE,SR:89:33:1:89:2 0/0:PE,SR:99:99:0:99:0 0/0:PE,SR:99:99:0:99:0 \ No newline at end of file From 524de9d051db3846c4ec4dc91681f60d8e265860 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 5 Feb 2025 17:19:47 -0500 Subject: [PATCH 3/7] do it correctly :/ --- v03_pipeline/lib/misc/sv.py | 16 ++++++++-------- v03_pipeline/lib/misc/sv_test.py | 10 ++++++---- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/v03_pipeline/lib/misc/sv.py b/v03_pipeline/lib/misc/sv.py index 5c219e9a8..b95456d09 100644 --- a/v03_pipeline/lib/misc/sv.py +++ b/v03_pipeline/lib/misc/sv.py @@ -22,6 +22,10 @@ def overwrite_male_non_par_calls( if i.start.contig == ReferenceGenome.GRCh38.x_contig ], ) + non_par_interval = hl.interval( + par_intervals[0].end, + par_intervals[1].start, + ) # NB: making use of existing formatting_annotation_fns. # We choose to annotate & drop here as the sample level # fields are dropped by the time we format variants. @@ -33,14 +37,10 @@ def overwrite_male_non_par_calls( GT=hl.if_else( ( male_sample_ids.contains(mt.s) - & hl.any( - par_intervals.map( - lambda i: i.overlaps( - hl.interval( - mt.start_locus, - mt.end_locus, - ), - ), + & non_par_interval.overlaps( + hl.interval( + mt.start_locus, + mt.end_locus, ), ) ), diff --git a/v03_pipeline/lib/misc/sv_test.py b/v03_pipeline/lib/misc/sv_test.py index 62c67250a..2c0696903 100644 --- a/v03_pipeline/lib/misc/sv_test.py +++ b/v03_pipeline/lib/misc/sv_test.py @@ -49,11 +49,13 @@ def test_overwrite_male_non_par_calls(self) -> None: self.assertEqual( [ hl.Call(alleles=[0, 0], phased=False), - # This call belongs to a MALE and intercepts - # the par region on the X chromosome. Therefore it is modified. - hl.Call(alleles=[1, 1], phased=False), - hl.Call(alleles=[0, 0], phased=False), + # END of this variant < start of the non-par region. hl.Call(alleles=[0, 1], phased=False), + hl.Call(alleles=[0, 0], phased=False), + hl.Call(alleles=[1, 1], phased=False), ], mt.GT.collect(), ) + self.assertFalse( + hasattr(mt, 'start_locus') + ) From d095e9c9661f21c406697e42b7d56ec62fd3dd3c Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 5 Feb 2025 17:22:18 -0500 Subject: [PATCH 4/7] ruff --- v03_pipeline/lib/misc/sv_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v03_pipeline/lib/misc/sv_test.py b/v03_pipeline/lib/misc/sv_test.py index 2c0696903..eead9778e 100644 --- a/v03_pipeline/lib/misc/sv_test.py +++ b/v03_pipeline/lib/misc/sv_test.py @@ -57,5 +57,5 @@ def test_overwrite_male_non_par_calls(self) -> None: mt.GT.collect(), ) self.assertFalse( - hasattr(mt, 'start_locus') + hasattr(mt, 'start_locus'), ) From 116bb9b0450e31a66aa1f712940d581257da0201 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 5 Feb 2025 17:40:01 -0500 Subject: [PATCH 5/7] correct feature flag --- v03_pipeline/lib/model/feature_flag.py | 4 ++++ .../lib/tasks/write_remapped_and_subsetted_callset.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/v03_pipeline/lib/model/feature_flag.py b/v03_pipeline/lib/model/feature_flag.py index 8a033f60f..047310996 100644 --- a/v03_pipeline/lib/model/feature_flag.py +++ b/v03_pipeline/lib/model/feature_flag.py @@ -11,6 +11,9 @@ INCLUDE_PIPELINE_VERSION_IN_PREFIX = ( os.environ.get('INCLUDE_PIPELINE_VERSION_IN_PREFIX') == '1' ) +OVERWRITE_SV_MALE_NON_PAR_CALLS = ( + os.environ.get('OVERWRITE_SV_MALE_NON_PAR_CALLS') == '1' +) RUN_PIPELINE_ON_DATAPROC = os.environ.get('RUN_PIPELINE_ON_DATAPROC') == '1' SHOULD_TRIGGER_HAIL_BACKEND_RELOAD = ( os.environ.get('SHOULD_TRIGGER_HAIL_BACKEND_RELOAD') == '1' @@ -24,5 +27,6 @@ class FeatureFlag: EXPECT_TDR_METRICS: bool = EXPECT_TDR_METRICS EXPECT_WES_FILTERS: bool = EXPECT_WES_FILTERS INCLUDE_PIPELINE_VERSION_IN_PREFIX: bool = INCLUDE_PIPELINE_VERSION_IN_PREFIX + OVERWRITE_SV_MALE_NON_PAR_CALLS: bool = OVERWRITE_SV_MALE_NON_PAR_CALLS RUN_PIPELINE_ON_DATAPROC: bool = RUN_PIPELINE_ON_DATAPROC SHOULD_TRIGGER_HAIL_BACKEND_RELOAD: bool = SHOULD_TRIGGER_HAIL_BACKEND_RELOAD diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py index 15111f7ab..0715b94b0 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py @@ -176,7 +176,7 @@ def create_table(self) -> hl.MatrixTable: if field not in self.dataset_type.row_fields: mt = mt.drop(field) if ( - FeatureFlag.overwrite_male_non_par_calls + FeatureFlag.OVERWRITE_SV_MALE_NON_PAR_CALLS and self.dataset_type.overwrite_male_non_par_calls ): overwrite_male_non_par_calls( From 8ff45f9602b132d8b1b8432e0dcb16c9c105b4f6 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 5 Feb 2025 22:53:30 -0500 Subject: [PATCH 6/7] strvctvre --- .cloudbuild/strvctvre-docker.cloudbuild.yaml | 12 +++++++ .../download_strvctvre_reference_data.bash | 32 +++++++++++++++++++ v03_pipeline/bin/strvctvre | 19 +++++++++++ v03_pipeline/deploy/Dockerfile.strvctvre | 17 ++++++++++ v03_pipeline/lib/model/environment.py | 5 +++ 5 files changed, 85 insertions(+) create mode 100644 .cloudbuild/strvctvre-docker.cloudbuild.yaml create mode 100644 v03_pipeline/bin/download_strvctvre_reference_data.bash create mode 100644 v03_pipeline/bin/strvctvre create mode 100644 v03_pipeline/deploy/Dockerfile.strvctvre diff --git a/.cloudbuild/strvctvre-docker.cloudbuild.yaml b/.cloudbuild/strvctvre-docker.cloudbuild.yaml new file mode 100644 index 000000000..39a47d4ac --- /dev/null +++ b/.cloudbuild/strvctvre-docker.cloudbuild.yaml @@ -0,0 +1,12 @@ +# Run locally with: +# +# gcloud builds submit --quiet --config .cloudbuild/strvctvre-docker.cloudbuild.yaml v03_pipeline/deploy +steps: +- name: 'gcr.io/kaniko-project/executor:v1.3.0' + args: + - --destination=gcr.io/seqr-project/strvctvre-docker-image + - --dockerfile=Dockerfile.strvctvre + - --cache=true + - --cache-ttl=168h + +timeout: 1800s \ No newline at end of file diff --git a/v03_pipeline/bin/download_strvctvre_reference_data.bash b/v03_pipeline/bin/download_strvctvre_reference_data.bash new file mode 100644 index 000000000..9292b8b65 --- /dev/null +++ b/v03_pipeline/bin/download_strvctvre_reference_data.bash @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +set -eux + +REFERENCE_GENOME=$1 +STRVCTVRE_REFERENCE_DATASETS_DIR=${STRVCTVRE_REFERENCE_DATASETS_DIR:-/var/seqr/strvctvre-reference-data} + +case $REFERENCE_GENOME in + GRCh38) + STRVCTVRE_REFERENCE_DATA_FILES=( + 'gs://seqr-reference-data/strvctvre_data/hg38.phyloP100way.bw' + ) + ;; + *) + echo "Invalid reference genome $REFERENCE_GENOME, should be GRCh38" + exit 1 +esac + +if [ -f "$STRVCTVRE_REFERENCE_DATASETS_DIR"/"$REFERENCE_GENOME"/_SUCCESS ]; then + echo "Skipping download because already successful" + exit 0; +fi + +mkdir -p "$STRVCTVRE_REFERENCE_DATASETS_DIR"/"$REFERENCE_GENOME"; +rm -rf "${STRVCTVRE_REFERENCE_DATASETS_DIR:?}"/"${REFERENCE_GENOME:?}"/*; + +for reference_data_file in "${STRVCTVRE_REFERENCE_DATA_FILES[@]}"; do + echo "Downloading" "$reference_data_file"; + gsutil cp "$reference_data_file" "$STRVCTVRE_REFERENCE_DATASETS_DIR"/"$REFERENCE_GENOME"/ & +done; +wait +touch "$STRVCTVRE_REFERENCE_DATASETS_DIR"/"$REFERENCE_GENOME"/_SUCCESS diff --git a/v03_pipeline/bin/strvctvre b/v03_pipeline/bin/strvctvre new file mode 100644 index 000000000..69ad67ecd --- /dev/null +++ b/v03_pipeline/bin/strvctvre @@ -0,0 +1,19 @@ +#!/bin/bash + +set -eux + +REFERENCE_GENOME=$1 +STRVCTVRE_REFERENCE_DATASETS_DIR=${STRVCTVRE_REFERENCE_DATASETS_DIR:-/var/seqr/strvctvre-reference-data} +STRVCTVRE_DOCKER_IMAGE="gcr.io/seqr-project/strvctvre-docker-image" + +case $REFERENCE_GENOME in + GRCh38) + ;; + *) + echo "Invalid reference genome $REFERENCE_GENOME, should be GRCh38" + exit 1 +esac + +shift # Remove the REFERENCE_GENOME arg. +BIGWIG_FILE="$STRVCTVRE_REFERENCE_DATASETS_DIR"/"$REFERENCE_GENOME/hg38.phyloP100way.bw" +docker run --platform linux/amd64 -i -v $BIGWIG_FILE:$BIGWIG_FILE:ro $STRVCTVRE_DOCKER_IMAGE -p $BIGWIG_FILE "$@" diff --git a/v03_pipeline/deploy/Dockerfile.strvctvre b/v03_pipeline/deploy/Dockerfile.strvctvre new file mode 100644 index 000000000..97d96e3a7 --- /dev/null +++ b/v03_pipeline/deploy/Dockerfile.strvctvre @@ -0,0 +1,17 @@ +FROM python:3.11-bullseye as build +LABEL maintainer="Broad TGG" + +# Sourced from https://stackoverflow.com/a/58269633 +ENV PATH="/root/miniconda3/bin:${PATH}" +RUN apt-get update && \ + apt-get install -y wget && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* +RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \ + mkdir -p /root/.conda && \ + bash miniconda.sh -b -p /root/miniconda3 && \ + rm -f miniconda.sh + +RUN git clone --depth 1 https://github.com/andrewSharo/StrVCTVRE +RUN conda env create -f StrVCTVRE/environment_py3.yml +ENTRYPOINT ["conda", "run", "-n", "StrVCTVRE_py_3", "python3", "StrVCTVRE/StrVCTVRE.py"] \ No newline at end of file diff --git a/v03_pipeline/lib/model/environment.py b/v03_pipeline/lib/model/environment.py index 448ac6e16..ec8eed049 100644 --- a/v03_pipeline/lib/model/environment.py +++ b/v03_pipeline/lib/model/environment.py @@ -20,6 +20,10 @@ 'REFERENCE_DATASETS_DIR', '/var/seqr/seqr-reference-data', ) +STRVCTVRE_REFERENCE_DATASETS_DIR = os.environ.get( + 'STRVCTVRE_REFERENCE_DATASETS_DIR', + '/var/seqr/strvctvre-reference-data', +) VEP_REFERENCE_DATASETS_DIR = os.environ.get( 'VEP_REFERENCE_DATASETS_DIR', '/var/seqr/vep-reference-data', @@ -65,4 +69,5 @@ class Env: REFERENCE_DATASETS_DIR: str = REFERENCE_DATASETS_DIR SEQR_APP_HAIL_SEARCH_DATA_DIR: str | None = SEQR_APP_HAIL_SEARCH_DATA_DIR SEQR_APP_REFERENCE_DATASETS_DIR: str | None = SEQR_APP_REFERENCE_DATASETS_DIR + STRVCTVRE_REFERENCE_DATASETS_DIR: str = STRVCTVRE_REFERENCE_DATASETS_DIR VEP_REFERENCE_DATASETS_DIR: str = VEP_REFERENCE_DATASETS_DIR From 2b6ab30a3e930ce36a2a0bb2dd1c24a1117e471c Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Fri, 7 Feb 2025 11:02:09 -0500 Subject: [PATCH 7/7] updates for directory structure --- v03_pipeline/bin/strvctvre | 14 +------------- v03_pipeline/deploy/Dockerfile.strvctvre | 5 +++-- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/v03_pipeline/bin/strvctvre b/v03_pipeline/bin/strvctvre index 69ad67ecd..95ac876af 100644 --- a/v03_pipeline/bin/strvctvre +++ b/v03_pipeline/bin/strvctvre @@ -2,18 +2,6 @@ set -eux -REFERENCE_GENOME=$1 STRVCTVRE_REFERENCE_DATASETS_DIR=${STRVCTVRE_REFERENCE_DATASETS_DIR:-/var/seqr/strvctvre-reference-data} STRVCTVRE_DOCKER_IMAGE="gcr.io/seqr-project/strvctvre-docker-image" - -case $REFERENCE_GENOME in - GRCh38) - ;; - *) - echo "Invalid reference genome $REFERENCE_GENOME, should be GRCh38" - exit 1 -esac - -shift # Remove the REFERENCE_GENOME arg. -BIGWIG_FILE="$STRVCTVRE_REFERENCE_DATASETS_DIR"/"$REFERENCE_GENOME/hg38.phyloP100way.bw" -docker run --platform linux/amd64 -i -v $BIGWIG_FILE:$BIGWIG_FILE:ro $STRVCTVRE_DOCKER_IMAGE -p $BIGWIG_FILE "$@" +docker run --platform linux/amd64 -i -v $STRVCTVRE_DOCKER_IMAGE:$STRVCTVRE_DOCKER_IMAGE:ro $STRVCTVRE_DOCKER_IMAGE -p $WORKDIR "$@" diff --git a/v03_pipeline/deploy/Dockerfile.strvctvre b/v03_pipeline/deploy/Dockerfile.strvctvre index 97d96e3a7..8306e9a57 100644 --- a/v03_pipeline/deploy/Dockerfile.strvctvre +++ b/v03_pipeline/deploy/Dockerfile.strvctvre @@ -13,5 +13,6 @@ RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh - rm -f miniconda.sh RUN git clone --depth 1 https://github.com/andrewSharo/StrVCTVRE -RUN conda env create -f StrVCTVRE/environment_py3.yml -ENTRYPOINT ["conda", "run", "-n", "StrVCTVRE_py_3", "python3", "StrVCTVRE/StrVCTVRE.py"] \ No newline at end of file +WORKDIR /StrVCTVRE +RUN conda env create -f environment_py3.yml +ENTRYPOINT ["conda", "run", "-n", "StrVCTVRE_py_3", "python3", "StrVCTVRE.py"] \ No newline at end of file