From b4824871720805115a5f236ed1383c073850eecd Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 9 Apr 2025 16:30:18 -0400 Subject: [PATCH 01/55] Formatting function enums --- v03_pipeline/lib/annotations/misc.py | 106 +++++++++++++++++- .../base_update_variant_annotations_table.py | 4 +- 2 files changed, 107 insertions(+), 3 deletions(-) diff --git a/v03_pipeline/lib/annotations/misc.py b/v03_pipeline/lib/annotations/misc.py index 17ee4430e..da5a48c88 100644 --- a/v03_pipeline/lib/annotations/misc.py +++ b/v03_pipeline/lib/annotations/misc.py @@ -17,7 +17,111 @@ from v03_pipeline.lib.model.definitions import ReferenceGenome -def annotate_enums( +def deannotate_formatting_annotation_enums( + ht: hl.Table, + reference_genome: ReferenceGenome, + dataset_type: DatasetType, +) -> hl.Table: + formatting_annotation_names = { + fa.__name__ for fa in dataset_type.formatting_annotation_fns(reference_genome) + } + if 'sorted_motif_feature_consequences' in formatting_annotation_names: + ht = ht.annotate( + sorted_motif_feature_consequences=ht.sorted_motif_feature_consequences.map( + lambda c: c.annotate( + consequence_terms=c.consequence_term_ids.map( + lambda tid: MOTIF_CONSEQUENCE_TERMS[tid], + ).drop('consequence_term_ids'), + ), + ), + ) + ht = ht.annotate_globals( + enums=ht.enums.drop('sorted_motif_feature_consequences'), + ) + if 'sorted_regulatory_feature_consequences' in formatting_annotation_names: + ht = ht.annotate( + sorted_regulatory_feature_consequences=ht.sorted_regulatory_feature_consequences.map( + lambda c: c.annotate( + biotype=REGULATORY_BIOTYPES[c.biotype_id], + consequence_terms=c.consequence_term_ids.map( + lambda tid: REGULATORY_CONSEQUENCE_TERMS[tid], + ).drop('biotype_id', 'consequence_term_ids'), + ), + ), + ) + ht = ht.annotate_globals( + enums=ht.enums.drop('sorted_regulatory_feature_consequences'), + ) + if 'sorted_transcript_consequences' in formatting_annotation_names: + ht = ht.annotate( + 
sorted_transcript_consequences=ht.sorted_transcript_consequences.map( + lambda c: c.annotate( + biotype=BIOTYPES[c.biotype], + consequence_terms=c.consequence_term_ids.map( + lambda tid: TRANSCRIPT_CONSEQUENCE_TERMS[tid], + ), + **{ + 'loftee': c.loftee.annotate( + lof_filters=c.loftee.lof_filter_ids.map( + lambda fid: LOF_FILTERS[fid], + ), + ).drop('lof_filter_ids'), + 'utrannotator': c.utrannotator.annotate( + fiveutr_consequence=FIVEUTR_CONSEQUENCES[ + c.utrannotator.fiveutr_consequence_id + ], + ).drop('fiveutr_consequence_id'), + } + if reference_genome == ReferenceGenome.GRCh38 + and dataset_type == DatasetType.SNV_INDEL + else { + 'lof_filters': c.lof_filter_ids.map( + lambda fid: LOF_FILTERS[fid], + ), + }, + ).drop( + 'biotype_id', + 'consequence_term_ids', + **( + [] + if reference_genome == ReferenceGenome.GRCh38 + and dataset_type == DatasetType.SNV_INDEL + else [ + 'lof_filter_ids', + ] + ), + ), + ), + ) + ht = ht.annotate_globals(enums=ht.enums.drop('sorted_transcript_consequences')) + if 'mitotip' in formatting_annotation_names: + ht = ht.annotate( + mitotip=hl.Struct( + trna_prediction=MITOTIP_PATHOGENICITIES[ht.mitotip.trna_prediction_id], + ), + ) + ht = ht.annotate_globals(enums=ht.enums.drop('mitotip')) + if 'sv_type_id' in formatting_annotation_names: + ht = ht.annotate(sv_type=SV_TYPES[ht.sv_type_id]).drop('sv_type_id') + ht = ht.annotate_globals(enums=ht.enums.drop('sv_type')) + if 'sv_type_detail_id' in formatting_annotation_names: + ht = ht.annotate_globals( + enums=ht.enums.annotate(sv_type_detail=SV_TYPE_DETAILS), + ) + ht = ht.annotate_globals(enums=ht.enums.drop('sv_type_detail')) + if 'sorted_gene_consequences' in formatting_annotation_names: + ht = ht.annotate( + sorted_gene_consequences=ht.sorted_gene_consequences.map( + lambda c: c.annotate( + major_consequence=SV_CONSEQUENCE_RANKS[c.major_consequence_id], + ).drop('major_consequence_id'), + ), + ) + ht = ht.annotate_globals(enums=ht.enums.drop('sorted_gene_consequences')) + 
return ht + + +def annotate_formatting_annotation_enums( ht: hl.Table, reference_genome: ReferenceGenome, dataset_type: DatasetType, diff --git a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py index 29c374e48..2497793ad 100644 --- a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py +++ b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py @@ -1,7 +1,7 @@ import hail as hl import luigi -from v03_pipeline.lib.annotations.misc import annotate_enums +from v03_pipeline.lib.annotations.misc import annotate_formatting_annotation_enums from v03_pipeline.lib.paths import ( valid_reference_dataset_path, variant_annotations_table_path, @@ -101,4 +101,4 @@ def annotate_globals( updates=ht.globals.updates, migrations=ht.globals.migrations, ) - return annotate_enums(ht, self.reference_genome, self.dataset_type) + return annotate_formatting_annotation_enums(ht, self.reference_genome, self.dataset_type) From f1a5a6cbdeb8d1912c85e5ccb7d8e3efcc4072d3 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 9 Apr 2025 16:59:04 -0400 Subject: [PATCH 02/55] improve function name --- v03_pipeline/lib/annotations/misc.py | 4 ++-- .../tasks/base/base_update_variant_annotations_table.py | 8 ++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/v03_pipeline/lib/annotations/misc.py b/v03_pipeline/lib/annotations/misc.py index da5a48c88..1f750991d 100644 --- a/v03_pipeline/lib/annotations/misc.py +++ b/v03_pipeline/lib/annotations/misc.py @@ -17,7 +17,7 @@ from v03_pipeline.lib.model.definitions import ReferenceGenome -def deannotate_formatting_annotation_enums( +def deannotate_formatting_annotation_enum_globals( ht: hl.Table, reference_genome: ReferenceGenome, dataset_type: DatasetType, @@ -121,7 +121,7 @@ def deannotate_formatting_annotation_enums( return ht -def annotate_formatting_annotation_enums( +def 
annotate_formatting_annotation_enum_globals( ht: hl.Table, reference_genome: ReferenceGenome, dataset_type: DatasetType, diff --git a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py index 2497793ad..d6eb1be36 100644 --- a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py +++ b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py @@ -1,7 +1,9 @@ import hail as hl import luigi -from v03_pipeline.lib.annotations.misc import annotate_formatting_annotation_enums +from v03_pipeline.lib.annotations.misc import ( + annotate_formatting_annotation_enum_globals, +) from v03_pipeline.lib.paths import ( valid_reference_dataset_path, variant_annotations_table_path, @@ -101,4 +103,6 @@ def annotate_globals( updates=ht.globals.updates, migrations=ht.globals.migrations, ) - return annotate_formatting_annotation_enums(ht, self.reference_genome, self.dataset_type) + return annotate_formatting_annotation_enum_globals( + ht, self.reference_genome, self.dataset_type + ) From 18a38c952ea161282c84f8a2bc35f23233c59235 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 9 Apr 2025 17:14:29 -0400 Subject: [PATCH 03/55] handle lookups that propagate missing --- v03_pipeline/lib/annotations/misc.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/v03_pipeline/lib/annotations/misc.py b/v03_pipeline/lib/annotations/misc.py index 1f750991d..12822686b 100644 --- a/v03_pipeline/lib/annotations/misc.py +++ b/v03_pipeline/lib/annotations/misc.py @@ -67,7 +67,8 @@ def deannotate_formatting_annotation_enum_globals( ), ).drop('lof_filter_ids'), 'utrannotator': c.utrannotator.annotate( - fiveutr_consequence=FIVEUTR_CONSEQUENCES[ + # NB: FIVEUTR_CONSEQUENCES_LOOKUP propagates missing + fiveutr_consequence=hl.array(FIVEUTR_CONSEQUENCES)[ c.utrannotator.fiveutr_consequence_id ], ).drop('fiveutr_consequence_id'), @@ -97,7 +98,8 @@ def 
deannotate_formatting_annotation_enum_globals( if 'mitotip' in formatting_annotation_names: ht = ht.annotate( mitotip=hl.Struct( - trna_prediction=MITOTIP_PATHOGENICITIES[ht.mitotip.trna_prediction_id], + # MITOTIP_PATHOGENICITIES_LOOKUP propagates missing + trna_prediction=hl.array(MITOTIP_PATHOGENICITIES)[ht.mitotip.trna_prediction_id], ), ) ht = ht.annotate_globals(enums=ht.enums.drop('mitotip')) From e006df043d1a97e6713d48919189211848e8bbb5 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 9 Apr 2025 17:23:26 -0400 Subject: [PATCH 04/55] format --- v03_pipeline/lib/annotations/misc.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/v03_pipeline/lib/annotations/misc.py b/v03_pipeline/lib/annotations/misc.py index 12822686b..c92fc5c75 100644 --- a/v03_pipeline/lib/annotations/misc.py +++ b/v03_pipeline/lib/annotations/misc.py @@ -99,7 +99,9 @@ def deannotate_formatting_annotation_enum_globals( ht = ht.annotate( mitotip=hl.Struct( # MITOTIP_PATHOGENICITIES_LOOKUP propagates missing - trna_prediction=hl.array(MITOTIP_PATHOGENICITIES)[ht.mitotip.trna_prediction_id], + trna_prediction=hl.array(MITOTIP_PATHOGENICITIES)[ + ht.mitotip.trna_prediction_id + ], ), ) ht = ht.annotate_globals(enums=ht.enums.drop('mitotip')) From 27d2464a220e25df1c0ae33c9beacfc5e43d3177 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 9 Apr 2025 17:23:57 -0400 Subject: [PATCH 05/55] ruff --- .../lib/tasks/base/base_update_variant_annotations_table.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py index d6eb1be36..f84397827 100644 --- a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py +++ b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py @@ -104,5 +104,7 @@ def annotate_globals( migrations=ht.globals.migrations, ) return 
annotate_formatting_annotation_enum_globals( - ht, self.reference_genome, self.dataset_type + ht, + self.reference_genome, + self.dataset_type, ) From 345a619c915ef640debb16542f7bff18cb34be4d Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 9 Apr 2025 17:36:51 -0400 Subject: [PATCH 06/55] rename func --- v03_pipeline/lib/annotations/misc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v03_pipeline/lib/annotations/misc.py b/v03_pipeline/lib/annotations/misc.py index c92fc5c75..8c16d0119 100644 --- a/v03_pipeline/lib/annotations/misc.py +++ b/v03_pipeline/lib/annotations/misc.py @@ -17,7 +17,7 @@ from v03_pipeline.lib.model.definitions import ReferenceGenome -def deannotate_formatting_annotation_enum_globals( +def unmap_formatting_annotation_enums( ht: hl.Table, reference_genome: ReferenceGenome, dataset_type: DatasetType, From f4c611c5c3b3ed678392532a6a535eb8452bbb98 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 9 Apr 2025 19:20:21 -0400 Subject: [PATCH 07/55] second func --- v03_pipeline/lib/annotations/misc.py | 68 ++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/v03_pipeline/lib/annotations/misc.py b/v03_pipeline/lib/annotations/misc.py index 8c16d0119..fb2e2372f 100644 --- a/v03_pipeline/lib/annotations/misc.py +++ b/v03_pipeline/lib/annotations/misc.py @@ -15,6 +15,74 @@ ) from v03_pipeline.lib.model import DatasetType from v03_pipeline.lib.model.definitions import ReferenceGenome +from v03_pipeline.lib.reference_datasets.reference_dataset import ReferenceDataset + + +def unmap_reference_dataset_annotation_enums( + ht: hl.Table, + reference_genome: ReferenceGenome, + dataset_type: DatasetType, +) -> hl.Table: + formatting_annotation_names = { + fa.__name__ for fa in dataset_type.formatting_annotation_fns(reference_genome) + } + removed_enum_names = [] + for annotation_name in ht.enums: + if annotation_name in formatting_annotation_names: + continue + for enum_name, enum_values 
in ht.enums[annotation_name].values(): + if hasattr(ht[annotation_name], f'{enum_name}_ids'): + ht = ht.annotate( + **{ + annotation_name: ht[annotation_name].annotate( + **{ + f'{enum_name}s': ht[annotation_name][ + f'{enum_name}_ids' + ].map(lambda idx: enum_values[idx]), # noqa: B023 + }, + ), + }, + ) + ht = ht.annotate( + **{annotation_name: ht[annotation_name].drop(f'{enum_name}_ids')}, + ) + else: + ht = ht.annotate( + **{ + annotation_name: ht[annotation_name].annotate( + **{ + enum_name: enum_values[ + ht[annotation_name][f'{enum_name}_id'] + ], + }, + ), + }, + ) + ht = ht.annotate( + **{annotation_name: ht[annotation_name].drop(f'{enum_name}_id')}, + ) + removed_enum_names.add(enum_name) + + # Explicit clinvar edge case: + if hasattr(ht, ReferenceDataset.clinvar.value): + ht = ht.annotate( + **{ + ReferenceDataset.clinvar.value: ht[ + ReferenceDataset.clinvar.value + ].annotate( + conflictingPathogenicities=ht[ + ReferenceDataset.clinvar.value + ].conflictingPathogenicities.map( + lambda s: s.annotate( + pathogenicity=ht.enums.clinvar.pathogenicity[ + s.pathogenicity_id + ], + ).drop('pathogenicity_id'), + ), + ), + }, + ) + return ht.annotate_globals(enums=ht.globals.enums.drop(*removed_enum_names)) def unmap_formatting_annotation_enums( From 6931585cd6e1fdfc7c8e2bf33abf674da8f87311 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 9 Apr 2025 21:23:03 -0400 Subject: [PATCH 08/55] start test --- v03_pipeline/lib/annotations/misc.py | 36 +++++++++--------- v03_pipeline/lib/annotations/misc_test.py | 23 +++++++++++ .../SNV_INDEL/annotations.ht/.README.txt.crc | Bin 0 -> 12 bytes .../SNV_INDEL/annotations.ht/._SUCCESS.crc | Bin 0 -> 8 bytes .../annotations.ht/.metadata.json.gz.crc | Bin 0 -> 20 bytes .../SNV_INDEL/annotations.ht/README.txt | 3 ++ .../GRCh38/SNV_INDEL/annotations.ht/_SUCCESS | 0 .../globals/.metadata.json.gz.crc | Bin 0 -> 16 bytes .../annotations.ht/globals/metadata.json.gz | Bin 0 -> 751 bytes 
.../annotations.ht/globals/parts/.part-0.crc | Bin 0 -> 104 bytes .../annotations.ht/globals/parts/part-0 | Bin 0 -> 11895 bytes .../.index.crc | Bin 0 -> 12 bytes .../.metadata.json.gz.crc | Bin 0 -> 12 bytes .../index | Bin 0 -> 87 bytes .../metadata.json.gz | Bin 0 -> 184 bytes .../SNV_INDEL/annotations.ht/metadata.json.gz | Bin 0 -> 1315 bytes .../annotations.ht/rows/.metadata.json.gz.crc | Bin 0 -> 24 bytes .../annotations.ht/rows/metadata.json.gz | Bin 0 -> 1815 bytes ...0-034376f0-4c6b-4bf0-8912-4035d651b982.crc | Bin 0 -> 16 bytes ...art-0-034376f0-4c6b-4bf0-8912-4035d651b982 | Bin 0 -> 928 bytes 20 files changed, 44 insertions(+), 18 deletions(-) create mode 100644 v03_pipeline/lib/annotations/misc_test.py create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/.README.txt.crc create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/._SUCCESS.crc create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/.metadata.json.gz.crc create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/README.txt create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/_SUCCESS create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/globals/.metadata.json.gz.crc create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/globals/metadata.json.gz create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/globals/parts/.part-0.crc create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/globals/parts/part-0 create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-034376f0-4c6b-4bf0-8912-4035d651b982.idx/.index.crc create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-034376f0-4c6b-4bf0-8912-4035d651b982.idx/.metadata.json.gz.crc create mode 100644 
v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-034376f0-4c6b-4bf0-8912-4035d651b982.idx/index create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-034376f0-4c6b-4bf0-8912-4035d651b982.idx/metadata.json.gz create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/metadata.json.gz create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/.metadata.json.gz.crc create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/metadata.json.gz create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/.part-0-034376f0-4c6b-4bf0-8912-4035d651b982.crc create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/part-0-034376f0-4c6b-4bf0-8912-4035d651b982 diff --git a/v03_pipeline/lib/annotations/misc.py b/v03_pipeline/lib/annotations/misc.py index fb2e2372f..a4d685daf 100644 --- a/v03_pipeline/lib/annotations/misc.py +++ b/v03_pipeline/lib/annotations/misc.py @@ -98,9 +98,9 @@ def unmap_formatting_annotation_enums( sorted_motif_feature_consequences=ht.sorted_motif_feature_consequences.map( lambda c: c.annotate( consequence_terms=c.consequence_term_ids.map( - lambda tid: MOTIF_CONSEQUENCE_TERMS[tid], - ).drop('consequence_term_ids'), - ), + lambda tid: hl.array(MOTIF_CONSEQUENCE_TERMS)[tid], + ), + ).drop('consequence_term_ids'), ), ) ht = ht.annotate_globals( @@ -110,11 +110,11 @@ def unmap_formatting_annotation_enums( ht = ht.annotate( sorted_regulatory_feature_consequences=ht.sorted_regulatory_feature_consequences.map( lambda c: c.annotate( - biotype=REGULATORY_BIOTYPES[c.biotype_id], + biotype=hl.array(REGULATORY_BIOTYPES)[c.biotype_id], consequence_terms=c.consequence_term_ids.map( - lambda tid: REGULATORY_CONSEQUENCE_TERMS[tid], - ).drop('biotype_id', 'consequence_term_ids'), - ), + lambda tid: hl.array(REGULATORY_CONSEQUENCE_TERMS)[tid], + ), + ).drop('biotype_id', 
'consequence_term_ids'), ), ) ht = ht.annotate_globals( @@ -124,18 +124,17 @@ def unmap_formatting_annotation_enums( ht = ht.annotate( sorted_transcript_consequences=ht.sorted_transcript_consequences.map( lambda c: c.annotate( - biotype=BIOTYPES[c.biotype], + biotype=hl.array(BIOTYPES)[c.biotype_id], consequence_terms=c.consequence_term_ids.map( - lambda tid: TRANSCRIPT_CONSEQUENCE_TERMS[tid], + lambda tid: hl.array(TRANSCRIPT_CONSEQUENCE_TERMS)[tid], ), **{ 'loftee': c.loftee.annotate( lof_filters=c.loftee.lof_filter_ids.map( - lambda fid: LOF_FILTERS[fid], + lambda fid: hl.array(LOF_FILTERS)[fid], ), ).drop('lof_filter_ids'), 'utrannotator': c.utrannotator.annotate( - # NB: FIVEUTR_CONSEQUENCES_LOOKUP propagates missing fiveutr_consequence=hl.array(FIVEUTR_CONSEQUENCES)[ c.utrannotator.fiveutr_consequence_id ], @@ -145,7 +144,7 @@ def unmap_formatting_annotation_enums( and dataset_type == DatasetType.SNV_INDEL else { 'lof_filters': c.lof_filter_ids.map( - lambda fid: LOF_FILTERS[fid], + lambda fid: hl.array(LOF_FILTERS)[fid], ), }, ).drop( @@ -166,7 +165,6 @@ def unmap_formatting_annotation_enums( if 'mitotip' in formatting_annotation_names: ht = ht.annotate( mitotip=hl.Struct( - # MITOTIP_PATHOGENICITIES_LOOKUP propagates missing trna_prediction=hl.array(MITOTIP_PATHOGENICITIES)[ ht.mitotip.trna_prediction_id ], @@ -174,18 +172,20 @@ def unmap_formatting_annotation_enums( ) ht = ht.annotate_globals(enums=ht.enums.drop('mitotip')) if 'sv_type_id' in formatting_annotation_names: - ht = ht.annotate(sv_type=SV_TYPES[ht.sv_type_id]).drop('sv_type_id') + ht = ht.annotate(sv_type=hl.array(SV_TYPES)[ht.sv_type_id]).drop('sv_type_id') ht = ht.annotate_globals(enums=ht.enums.drop('sv_type')) if 'sv_type_detail_id' in formatting_annotation_names: - ht = ht.annotate_globals( - enums=ht.enums.annotate(sv_type_detail=SV_TYPE_DETAILS), - ) + ht = ht.annotate( + sv_type_detail=hl.array(SV_TYPE_DETAILS)[ht.sv_type_detail_id], + ).drop('sv_type_detail_id') ht = 
ht.annotate_globals(enums=ht.enums.drop('sv_type_detail')) if 'sorted_gene_consequences' in formatting_annotation_names: ht = ht.annotate( sorted_gene_consequences=ht.sorted_gene_consequences.map( lambda c: c.annotate( - major_consequence=SV_CONSEQUENCE_RANKS[c.major_consequence_id], + major_consequence=hl.array(SV_CONSEQUENCE_RANKS)[ + c.major_consequence_id + ], ).drop('major_consequence_id'), ), ) diff --git a/v03_pipeline/lib/annotations/misc_test.py b/v03_pipeline/lib/annotations/misc_test.py new file mode 100644 index 000000000..bd1afd143 --- /dev/null +++ b/v03_pipeline/lib/annotations/misc_test.py @@ -0,0 +1,23 @@ +import unittest + +import hail as hl + +from v03_pipeline.lib.annotations.misc import ( + unmap_formatting_annotation_enums, +) +from v03_pipeline.lib.model import ( + DatasetType, + ReferenceGenome, +) + +TEST_SNV_INDEL_ANNOTATIONS = ( + 'v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht' +) + + +class MiscTest(unittest.TestCase): + def unmap_formatting_annotation_enums(self) -> None: + ht = hl.read_table(TEST_SNV_INDEL_ANNOTATIONS) + ht = unmap_formatting_annotation_enums( + ht, ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, + ) diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/.README.txt.crc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/.README.txt.crc new file mode 100644 index 0000000000000000000000000000000000000000..66f7dd3416ab4534150994a8aa38a75928280e64 GIT binary patch literal 12 TcmYc;N@ieSU}A{8c|{!n6FviI literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/._SUCCESS.crc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/._SUCCESS.crc new file mode 100644 index 0000000000000000000000000000000000000000..3b7b044936a890cd8d651d349a752d819d71d22c GIT binary patch literal 8 PcmYc;N@ieSU}69O2$TUk literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/.metadata.json.gz.crc 
b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/.metadata.json.gz.crc new file mode 100644 index 0000000000000000000000000000000000000000..513ecd0d33551fff1d663663e60e663c44852f6d GIT binary patch literal 20 ccmYc;N@ieSU}E_HC9Y|q`G>`RFE6nJ080D_i2wiq literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/README.txt b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/README.txt new file mode 100644 index 000000000..6e4fca836 --- /dev/null +++ b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. + Written with version 0.2.133-4c60fddb171a + Created at 2025/04/09 21:10:13 \ No newline at end of file diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/_SUCCESS b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/globals/.metadata.json.gz.crc new file mode 100644 index 0000000000000000000000000000000000000000..2d0b45e3f15e98be14c524fe3ec0a037d86b89b8 GIT binary patch literal 16 XcmYc;N@ieSU}A`x`Tfk)h`WCQDMSXi literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/globals/metadata.json.gz b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/globals/metadata.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..35c57a2a0e54bd49120bba5d1c00c918e0833fee GIT binary patch literal 751 zcmVevza}_6XZ#c6U!t zTnpe1PiDKGaJ#KRd^$Sl6gY@HZx(ZAVTq>UxZ8lSQY$-)P|FQ~a+p-iY*VpPHm|+S zOQ|-@c%V?aVj_blU4b3JN-I~RR0aJhi-t*s{*O`XKjh& zc`maSj3bX3Y0Yr0=@ODl+)hOtwcW>UUu=GOHX1?Dp9Ac8@Vf!UCtK1FD-7sI7NY&& 
zJjP3Ez%y41rWyu1&NjM)HV7iCG|D{7fnj5!+6PerPex;_rmE;d(mTfK-0$H>#{jPUbzrLLr?_LZ}?_77r?^*}!_osXA_NKG;`x1xl7oz9x7vW&^5;wZb zP}UB8Igq@hJtJlVLePm=!lH8n`5MF+weJkknvkp0LN&Uu3Y^<6*2Q&D@`nH(AzxXist~B9mS&HDuywv=WqC|&Q4qVDVbm02x>hgL_`(;t= zsjuWw$-t^1vvohZM9mC-ktJCAY)qW-_FdvWA0KX*DqyY~C7f%QSR^NfB@!rcc#w*h hjUcJsyQt%E#1`(1Ls}P&AsXvA?O*W)TtvMJ004;?X|ezS literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/globals/parts/.part-0.crc new file mode 100644 index 0000000000000000000000000000000000000000..ebf652b1cd0c25590abae55095e71472147f1221 GIT binary patch literal 104 zcmV-u0GIz`a$^7h00IEErkg_pMEq!?Q^h)d&J^O+T(&zr@G;5IDaR7U39YogKGb;P z#gtQYQLy({1_^0Bl~Dr2*=2-xjKoisM%9H$GLpl|=$?so7J^pYo0(lY)1)-6g&wKB KsNz%wT1%D3z%keW literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/globals/parts/part-0 b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/globals/parts/part-0 new file mode 100644 index 0000000000000000000000000000000000000000..0829aa95e130fa7a307b36652720db8610028783 GIT binary patch literal 11895 zcmV--E{M_3DgXcg0096fwJ-f(0RIhE0U8nNXHziT)uat0sp~Ib$v;KW1OoUiSvGON z7HC0AZ2%QSb@yYd{~LkSCbF@Wv_>iN%79{_+SwU9&v=jdPZee&ySux4d+8zo(1XV? 
z3}czbaX=ik{gVc526zVW*-qul*=#oveXdBIL!L&%gl&rgZD*;IRBo~j z=o7*;QYQ_uL8_fjB0`ftuTCcssdPFiNcErGrx6GW;bvt~8?&w<6u^?ou#M13LKLyW|N5+= zrj=>eVE+8dAguElX&_tTap&h{r^T_TKq$m;*Wf8`5H zo8{~Kf1YQ5;osL&BK%%-k_r=_m1)7ZuJ}NhmaxM+PKP{3SFP!&%P3ctlcRA?0jYTJH@FGb=`<=DoaEF zA%tX&LD?kY)2ZvTDi^`1P!Qdsvag;8(J9<1AK8^NEJub}6^?A8K-h=!HU0SL=oSx> z@o%aUVMH7xN){yG5sF-v4!*O5}pevbKPnc^S9kM+m8)rqvc%-8^O z_1Ho9_`wn*?Oyf35+mh}@$jJ=CM-=|Kd{6|R>yi^iIMuoZm`5i7Y{5k65bdU`hg`z zqCz)VVx$abFg~!vNEgmvd{_cyNssUUvH0jQ5%wtA;Udl8kY=PAC`_`{WlrHBy#++0?ctI_ALBRq7A`uv4EmF19ifi~_hvV`<#nQ8HHeP)HMOZ3C_ zyJt}UA8uF`*7KENqe(|bVffQ2-YJcf%n+$Wp&ymbSO!UwnH@JIj*}!coFs9v;l!mz z17KWiz_`@t;dlR$5|4Fj2f&KqfniqFEteb^W>uZap8#A|Ci=ZucHHOzNs#`b`s*Ft zqO$8TEMqH%$dJzO-aizUfKKJ?`rV%hxyh)>`gAJWR5f+0B>_K(bPC{uh|X7ERT=XK zVN|7ahP`y82;WpsLow+Lv8nza3zTIiIecLJvi4t?|L2pdsgH2Tx+aVvAnTdBat;3K z(WGrIVWn~bWVG79UDVeI;;TN?=PBWuE$)v=InQCX(8g<)9- zmhGP=79W6c!inLqa9lV%G#jhJaZ`0GhYXV!5f3sJ^;bc5fuoEMkEWsWpFrB9%jJxK4 zSvPs7+~&V*)xy^%_v}+RKO9(9qv@uiLV3b&S||AkS(p{Fs*JO|yR|zy(>S4o{PSxr zUfbYgWuu?JNn3yGB{$V*Np+A5jt&0(*SwU*_pn3)yC9Ky`<}IP?!URkXmx{=mTj8R zHti^!sFpb3)VM+vEYJlG7%7Ou6#1q<4TfKnZE($Vt4`~rBQj7t?Ztzn!LP|tT5CnT zc#zT1nd!scq9k*>se%=_uj9nx)*>f(K~80-kk@eQfZ% z7pHb-a`SnN(-iA&vew+<+p@jwVuQ2Bhg5RP7()$JTy7ti)>+D7ZBkR+=4;uuqf@Jo z&bMEQRaqFGChTf;k`MW&i_k$JiKpY_i#Ol*q@|5!o?CrVHV7xz^mEPa=44r83fuTQ zRlSH3;TjCnWch>Ztxi(G-0%2L`QPa_8}Du0YT5M6v2iD5ob(IF2RKLp0xeuQsgG5y zby5&ad|FlImaJ3OlFq;0liT@XeJ$o^8+2N>IJs}refcsUk@ABXDnLOLl=w@8v5%`Q z53o9R?&6R2xR0*CFjn)yAL~2%-E$T?>h9EPeB1bRvgND4y_7ZoZIaT%3R18G4)C5a z_7P>=Da=Jyw?a5p>w<(#2m6Ye@U&~@rCG;)?>AcMZSO+aX1=z?uNqEPNy-mhP$3Jm zKHT#X{9L3VK^=&Iufy4Hf@h7PJ{Io($rs18Rwo6C5MN5XjFoAbubY)mOJUo!-d1Cl z?QMRW<#0PVBL{n%o-Yb>?1S&EPVzDBnw9UA4q3ZX>(g zKUog;A!~`3RXS;i7aVJer@EE(Q7B%ZjU9Bb8ZEhklLTJkalfLQ43tqe81pNGR*G0D zqKFH$JJ@#%XIB<=9&~WO3x3gG<>Yp%FYe~}-*zYZW{bC8>on|wlVP7#TbPM`_v$11=PNdXS|rcP%5xYAlb4Q`@!Acuo|`vrYi@97+5SG-E_^!It2s85aDg$B zkh9G7g>~$%#nWFa-`^w2#9%yIm1xx2(@qz{>awQI?rPh+EE}AlY_n3yCkrQ#0;$8; z+c$Dw^ 
z!sqaWDXw%F``B=9bnycd`FU^BL!rT!ejm!bv>w4xM$F) zQ%Srkp@Z;Y8BdhXwp%`{?}IzHJ+*H9TG{xFvpIS1-tI9V1;)DqPSz)y&tj5=$g7_Ifl^0Ye>2S*6ZAY%;J$A(rX z3GqS{XV`%bH0q!Q9dM99242|04RGK=3w!`U4CdfP40sR*6h2g;33Q0U#syrk1r|PR z0Rt_Vp~n|95W)-6K;sA*AOITHfCCAnF$6Qh0Aq?IOdvI+poI^H5QY~)C_)x!d@+O` zyr=;iny5!CFa$Ex!G{ab;EX8TAY+RSh_T}dDR_Z}9CYY_6Jqd?0W3rjg*3{rg)MHN zp+yyL0K$wLbRY*jc)<%~U~!A+=s^s?h(Qx7nkYjIUC`nRI)u{!GsJKQ9zbXV3YGwc zE|yS*3$UmI32s;+h7-7;wdR3oX&%o^E25K)MA^u~Sk0%CJyun@CFaG$BDr9B?ZcAg zd67KW1Ji;%o*59N&?#d~0K*eexZsQ}+#n5D%s>f9Fhdtk;6M-FV22$nAqgMIfEa7= zB1-8b6&+2uA_-G8;SOh@QHfAO6Jv-W1Y?-s3N}p8MHOPC;SNu@!V@ysLJoEip#&=6 zqDC8}kbxC;fWiz!zylg0m;)BFz=al?@PP?ggpq?O(ttr3Ot66yZ}{N`ZV*EYK$szf z82I1{FwDRO4=6HVg)q)wgDlFBIp83I7nA@87?vP}FLY4h4R$C)5!r6SgBBaOLJVd| z0tdhtLKMj0fgYZ?;)xN|pn?op;6R5IQb^+qjs~0%1_*GlfCxYU!VMg3fDR_0q6sN@ z(1QhX;6f25bnwbB9lPgOohSLvHZHks+gaK2v%F3GU!JZ6Qfgd*1tUoDeUyp5<65na zAJ7Vf6u-ngnQ`(G)2Wfpnr)6&zb3D>&0Z^8l{YpDDdd{o6f5-HQYZD8cFoGPYjB2r z)``2)x}SS*>h|2V?Orb1zWlYwq5er;&U8;k#^b`_!GS1zFa~rHh#RnB15>D|nw~L+ zeXz6DNkOtKhc0WvvA&M}7v-dz&S;y>@3yH@*`{=^(fKm0@p7Di0u3yr)bABmn%lzI}d6w0JGOn_= zst64LVNSWb4x_Wl}hAfPo0Az{3r3pn(x~7GX4(WutS2yM;w++x0hA z2ee`x${iKlXUBIJ8dQ>OXVCZ}xMrcSnL zKHKJRx@+cesL;U^fYUD9^E`)L@LlZmQh#Q^B%mh0@o9viw<*Fy_e;Y*AN+v5!rM!~5v1J4#yHlx1#I zTiNV}v+)UKma$?Etijjwo|pJ4PtnUzz+o9rly}eOZEv(ZHx1u5>7Z=ZI@ogER@T`B z=hG?B&YwwHog@Uj)Ja7ob7*dg(di^1a!w}^$sdldTv4M>#<^1>46@PK%IPE_V7D;F zd)C$^a&kIJ<%;{`6B5BAp;6ZTKS|{2#p$FV*N6o&WWwpBAS9el8UnK{j&p?5Nk=kh z;|HgccvzpjDbOBGCk>HqI;ls!(n&=)^hzfQ$v>}jk`Z{Nla4%={oCA&%GK@^CY>~- zsw|OoQV>qrDxG9xltCEl6ir3h&?=oo#HLd^$;cIxh~L(=sStkF67QyTk_zY(x<7E0 z(n&@x3k2HOMS-M~gup`kNhb{fW!H-=km`FD(IQSdNr(u;PC7}2hdz}g5^r>pk@y4J z=p-T(6FrCUv(ZUGZV)s%qx2}llgNrrI`UVxxj!n6P7*TdhcdUcq|r%3SRrplCmoS8 zI%!CB5|Bw`Wmtca$l7?m=%gUgNkm|cywFKO%pwvxX^1Omg-%j=^w!|7GAwVK?>J8A zq#y28HWtTHLMQof6XF|gg|Lj!Nj)IWE(=uQ_@I-3R27wpMj!khbW)J_YOI4!;*oV7 z0!<>QD}%6{Wo~u3Z89)x49F#|JL;lw8-!c-64Se>Mjg8hZnNkt-c zl1i*7n>(kIgg_@1k6h)=dcXp+NIhCk;^{br{Boj@)^3%AKQew|%W{+(FqsJKkKi@!iw_j-}VqDNYlG 
zwa#3p=JK2>m5*lmrlhmUJLy)flWodMTjZ9< z7$=<8!dYMUxd-c{Qsw4*juQK8iN{dp@MNKlYxDrIsLI+Hr0_v{lxdzJ@g%d3xuhT-<<#l z2+faSD{*|l_a6Q#-{5DRaDbQ(y2wNQvnfoE;1GFFQ`g3xV_0d9>fD3S!J5vT^aq;} zc9Zg`@}@dfZ46@)7paqQTu50sLiY0q9mElaGX%WJ{@oeQkF4XT8F74#!AYIe!FbQu2S(|nBG0*ogcFt~`kKaHbgn4&9_|zk2Ex`wCmHD$Mzg7_zY!RO zRXIwhaz)i(j`91XlZ0Rw85f-72pCVNgJs_?B_6wp)k#Cz2njhtg1$LbKH0aQZgHBn zY4>ESo6?rIo;S{YaV)J)+A-~#6`3itv2I=^ZHaOdg)mAtmBi8 z*Uh%4ecRSlw#XrD^-gob_b|Z0J~mWV9jU?5jc{m0WtW9@Ys>7h&U>dPt#5{owl_`P z9DUiSjJk10H@wlczef8p8;%VWoZyQwYy|D=gF{(_<+M&Rf-$`Jv9QR@bl`i=<<=HH zZB%OAc$BhfIPI1vn14?bla%bSsv1pr-s&VGGUlA)D(}BK)Y3ZtJvedRpr*FjY-OvN z?xtTydGCaMY%r|VNkH;n77eC8R_CvUd6Lh`CmYYuHmYlE_r5pjd;llDe0rupEf~gw zMCWOmThdEoY&pJel3v*+`e2jq^^f%MP*Lt=VuTYcr13dA#-PO-gzz8zUOY(N{jDS2H@erLn{w0E$wu#-+E0_! z(@aW_FpMFLB1|-)&!Q)Qb@qw}?C*R(e2v^UFLb=_^M?zYFbY<6niWaV~m z%2OQFKjDcnZf`~Mk_1l|85i6$sO9oK>79^EPTRYBTg*~6o~PI*d)=j!Cox@XNm`&- z2pxR5Q{mQ@a>`}a&F+}nrBOCH?KgRUPI_aU8a%iH3QABzdBVJB{8(Ky9W)t7{eW?brKLfKWL?sqy@^o zx6Y|fccE>y&fDJRvh6+XMyIn;SAAg_9{q*st(yv2{XHXYU80w8zH+BRFM}x>%*LXr z@(h0RB6WU(SG%8*N7w#J+uWnFy@s-JM{~AR^8f=4wt#|6$K`6{rrhnr>?OAJFMY}`5F?~GEwNW zEO#btE1fUjR^2YQx4k-L%Now+-hF>$jqS$HPi@#7Q&!K z2+;5a350-x2VpoP2T%k-1!%y43S0<=Sg515dN7a_AyY~jGcqzVa@T@ObYxwU%Ag7t_#se9 z|A0Grm=>yZP%7@s_ba~{fq~iq>mqp_*8g>q8FMy5{+oO`(P18)x)}*Ma=<~8XPMpX zm#Pb_qL*;}JW3mUx7b|*TK&v{acH*u+5CEvD&fGc5*qAx>f_@`Z!+G_xSA8AH#WA` zDfJ`6mY}mMGq7iwP1wJ2P?{LqJj}(Q>MlNC%r}LjnjWJx^bX#aG=oOeRMO(nQ#B>* zY>(OK_M|v;P45+b_L`uwxc-7`@8KxQPb2qTF_m@_9v+a|gGkdeP86je4uR?{dXo-?$QP37%U^woj z+<~0K#D#tp%_>(GfS_9ASs5|?@S`JT&y!))G;WM*RVY)=9mOBURi1 z=3f}>U#j_5=lE!Jhaj1Ekb*Y}PJ8w%5F^zU*o{!)`0OcD9u3AyFP>mU41ONnRZr*M zqnBm!4_a6>Hn?Gog9B3OVS$weJ%zjUxgs8mtJc;z-0;ukPPmWNG7yXqVTjNdCa-7{ zCf(SiXfat9A4-SHjq=?D2rP^M=1c}b4c1?10bb7os`ZjH*PZrU_Kwu-oS)0`l7C~B zkWyKiphuUoQBc;}6tZz3GKv*ScEK+jeKNp-7Wz9_s6t_+))FI>13G(yr^DYdRQ(U( zP|P`G*s^fsfDLR{>C3gY=kD=5=6O>r~^}i%SwbxBN$-3)7;1 z-|NH%gsb%C&n$X2G!$LN&cXVux+k$bARJ_H&YoyNlm>OA3|v;D%Mi%w*VoEs*Vx7i 
zI1n+g;*IiwJvp0n_T+t`9Ut>5)Sv!9AcHV)ekrpwMKBcNMrLQ6Akg-J2qy$Q{q8~< zNpe&?apI%e4mDayjP`Swe9G8>qVS+|uPzJ<9cr)zHusPPr&hK8o6(x3PP{zR~;zFMa(=7wcr_qwZj&CpI`-luL@-*EMNuAI0GFR?FY!i3pTS_1ll3U?f# zMFOOv(K3MBy+b#??APJrh#}*33Z99(3mGOXYQ{(CRDp;fkgcnm21Nj^mT+q{1~#>G z*>XZwI%oSFH}4=eiaBY(KqW6I8lYgWuaK&8Ou&vaC zCS?cA$DWJZH_IzKvIYS+kmeAxj<~#j93>4Y)p-ZIfW)91S)~CMYRCfqofR1GEF#bs z_--Jeq^U^feGAlCiu~K(CCipos|5t=Rlf{@ziMd@_H0)Cm!9BMX$p(_NL&`nSPABA z6G5n>W&P5Eveuhs`9>2%IT!aSt`}E>Q;-u(cZI4zGrI*%VJ6^}fZRg}(lIR^lt!E- zggoFEJC0G;aC`}V={oPP>54GIA9QpSAUtje$6p3I@bi?@p zxQ)D)Br)+;A8=en*~G~aXW;n0Ri`>JK#z>~HK( zs31q19IZ8_J#EmTs8ufkbph`cG1^O{MgN0Slz3HZy0a7GdgiIEY71NlO~vVG8SKyH z#s`3cVzEvU7s%=YI1^$NIgzeB%Jvn-qF-10sS$EIz|rQYgOQYQ^MNBsfe9&L3$aV5 zZcRr?xQEd8XF{Q-WkcL=oCRl}Iv0INo!qTPL}Bhko(7Fcf}gL1N5V>0GE+GR)E+AF zZUa`pA*M})Q#(ZUXfJ{39Q?`@o=TT=n1jyNw%sB}{mX5S@|c#Wt)@^KJRAxydGASi z3sS26DO40@=0}0OAfT%U^Bb9gqRYT$-X9U|sR)8n8pFufI(2C%iz(ota@(or+nUgp z7ghr5oV~3o|uXJt(gnzygvbwjq{NAwkdjx|Wb2Z;_3x z91rM-NU+yD#;lf#3|)?Gy##yFfS;a&*il z2*-#Hknfanv{FMtuWhCFF4WYOPSwi@K|cXKAYVSG9^h~TpM+XjJ%8$7B2?BVlba#0 zM|hm5DE!_vj_yo|PqQmF)exeL)R4|Nw-hO!2XWk>;i(}04%ly?(@Gv$NRpLw-ydXW z&fJNZG2hesC*C56gC)K-h|sTUO0CM{98n5j;Z(u1@7bm8}b9@tSbr3wl;j;Ww5>{zmheeD{`j931zv3hDtx!noY98KClE4{A?GxPl zB#F*uDUBF`&md$$+S?gX5$>Ju`AjNG327mc09zG4xKgk!q2TQ<{p#=h<5ks%5djHZ zFVP|)$N;q1K)TA;7_utN98sS|{W4OJ3R;?KMl?P)KgGtJeyTwO<3>hWS;m_w?PQ zJ-%OBVsB_e7dD@ZooYH&9KW&@-uxlBrPg}2sjFZt3~57IF5PMHPyEvC)RO9vDrR~K z*{~NR3y`xWh^CqYPnS@oAtVIw5b0Ra{i~V=@WGlic!Xa=J+KM^BcRFeIlfNIq`!$= zc;yQ!wq|PtT5B16D=n&XI_b!4bU^_Jz9Ct(0vVDqedXnE-ILl_i*a?M*RI#X=)>nr zU%IilA(-p20+RrfP*=|Rxeb92pP~JXpuuVd(pL&v3)KQQb`J$X&@v}q7su?YL+d?{;$4F;&5}o@0w9=dNpCNu4p*c6w@`itb z*@kXYTpDSTV5XZA_D`xfzNjLg@=5H~@`agju6NwpkJ zI*Z+DVi3V99f?}0MY71t^bgLDe5AvvUc{_4^)F1xaK+!v=cy)=36wN{l6sA?cC?ZW zPJDGK^#NqkL`>M`KJeviK^5ev}=NsxaQM3tY4xrcu#N z0(eJfV%Bzmfr!kwb=3PLBUZa%ss}C@UwWnG?~LStf!^iS?13P>0%`_iSsXK%VeKG0cIZCly4=6kJCZDH5ut9IAvV z`7zk*YNMYHY+E>3+`}RqvYHd1VVfsAX84picyqW`u5=31gMzU?hfqf`4tG>F;t({n 
z{b;-DXrx~15Sh-SWKh?9qKN;&>iKpf#fGvU7H>oEzV>ND@7TVKwBC;5^99Bpbya)iy6$$FZUjhq?^8)u(2;vdL=H0 zme&Gjza>T@r|$<|n7+Y$S}`MhI3i7BMa1=#Fd@HZ>;FJU!{w9m55d(-FOjRjLVH2k zb=r|qG|jNyC=5^^-q1~hoESenzZD=_E|z^){vP70m&+d~fAzxxen4r72_##E=r%v| zVrepmJy|BNuO8MA~1pqo}!%-3%gswPaFQKv)vQP{}{=c5u^AK(+5#%CbK9-_UIclIh3e zsrU6fVS|rb5XsDh*FX3>(>I3&EQtQX=JP%irjue_+&l&oEi#N|=o#j@d(Lqfd&Ua0 zXDNpk)Z;z42+L76$E^z;+rHNylaPn`?-Jk};Vd>Drv(J*-Y>OGXu>Br`_j7zNK z6P|E3;0LW^DWSLW$WPb?{JP%5{C3Cuop(|K@{ZUo)xOt&cllloF}GNwBm@4Lz`axx6J*2T=#hr@-Y5aWTNOro~U zjiqV(ZL1mr0*g+cr9S6)cY9tWz+1BLASCd2?xU&+!BK{}M9vgygFP5hpys83-+p6^ z`lK&|b(w_vL1~L4woY)ma; zG`;Ni1rU;pVHYbl{Sdsfl}bU6)mA%vh7r)9+v%`r?tu)O|Ed6fH0XYGE)Fe+(wm?3 zdvO5;pfB|2hEZn)&SAjPx?)>up>{aA8aoe6V~U{Gpv**&{Z2`MaYv*(1M~(VsRgJB z%6BmI9;)a4x#K*W3Xc=oDFL}`MI=&xt_ZsJ_E!FO_lg z8B^j_kv#d5B()6-M+W2<6(3W=LP6KWM#l0Q$6{~W^yIo3wO6sSYkyND7d7Z@;2}31 zH219QEM?maUIVeej>kdMt-i7LHRC5=*^DO6O$L#JT!gf_N#hxy9@U_%>rY`YDCtd< z0LDEGY7XH&=Hh8k&nU6l@D{?@QR&GhkKNd(_A0f}y5uTCDrwDnzV>zLql7QoV0|Cp zj22V*q}N=#IE+iaVOTwqrsr1QLml0scGbw8lgl!ifo-uT+_=N0w~+WM#rr0qpE=(H>l4 zjx@c!iC!@=MGR#MhLew>>h_ffy;A9(oihsJUK$w7;V2K-1PlRCC&~en8H?fo$ch)Q zl~>D-wWND`PWV4guyUAjAoT-N8(ByvUnFFXaQZqXIoiP|&Ek`I6hQuDGZ@oH8YqDH z^gq&=%BOD3N%T5=pn~koLNo#tW~$j!Hc=8szt_hDcp2=5P9mnwkmbE4!nBh6HRh%6nP-)JVvVi(e-)Gj8G%eDmgI~g!;dR0nHU(I8ABMr+)rSxJEP-2E(M?(VVD-- MnG6h!KwW5h01R0YQUCw| literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-034376f0-4c6b-4bf0-8912-4035d651b982.idx/metadata.json.gz b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-034376f0-4c6b-4bf0-8912-4035d651b982.idx/metadata.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..ecb2944baacfce617176f68f96181c5d7971096f GIT binary patch literal 184 zcmV;p07w5HiwFP!0000009B5`3c@fDME_+^3OQ7*rRFAr9uyQ6FXADs+a?$iNw#1p z{dc$CybJ>~`(}G+jKv#OntTA6XRAVhM$>8nIT!H4bWLvr z2^R`oO}+JX7v*8p$?m+-i*hQIwP-QQbqFfmxyZ!F00NYAo)q!qKV;0@9#i~n827}_ 
mNs;El&vP2Qr(@GZE`{iwFP!000000Ie5mPvbc7zx30slv^zi!K&Z7g)RsKZ30;QQ0lj+g%!T#51l2gVzkb(jcgC+_%DXGbkxx*rhp=(rcHYFhC)&egXmut$WE2FB| zyfj9%hjpv~goT*MqGT;}T6!qvOgvHL!A!`4CIp^k0eXNGQjl0Ch{1nC8%3Ego=2CQ z#XuFENlJUMj)zUBo+%P$S_csi|I@$y1B1~9=1W4 zmhq}G)FA6E)dmzP6}mGJ2r^+MsMesYNS`mV6_My3i<#LBb?l_iYTCY=z?!R0jgEXn7b*G6BX*IgeE0~ak#w=+ELQJuD80Ml53=5Ss^UVzr5I|TkU!HUv0`0I1%y;Ys zMQM$J#ioZUd8LUR3eIjrY0ixm_n!2R46$C<;W&l|>`~eF5)%nLHR7BnR-igW2s(So- zHSlZz9ZyId42wxKz?vzI!2_mq*SSHa?uiHWn}sWy$X;c zo!6Dm6B|E*Xxw;cQhOnP(Z{(I21D%O@Eb&Ca*sSg`xFh_%K6EvF)^oQhh7<~OhX8< zvTTs2Pb_i6!IxpijUzV}8gR_e!sg>|W^#*a*qyyGW&uA}x7W)t66J=3(R-NiI@X#C zk0N9l@_KnS`-fcLknfk*-)5Js+yaS#nlmWa zz~o?e`G`$-Y<|n;tIOxegzWF1OT`Mj+GpqeG)F1U=b|827ni;?efoTMaze`7uB#rk zI$zulxNc7GPA>SO~ob23YQv?V634hM9VwiI7MwA~4fEZhljLkHpRF5yA6y+!%d z0{vp`zAD@bbskKS6xG{1YZ=_sY}oz1Ydjo6mGGIvHkUlH6WL#Ux>^^^V2y?w8(K}+ z$Wj=!Z=T)hGVTG6aD0%)vM8khVf1ABGO6F9?o;4}gm2WFyOX(RS(OmGXKwqp_554J z%AX&XbId4I;vM9oAub)Ef~6U8kl8Bv6`fA%wz|mJG>oH`ie_iY+H9 zemhy=NE!e4TiLQK$x1?)>6dnxPP|(0i?mujwqLvyC&U|g*J((eNqBdEG0jPE?D-yg zBx`T*-HWqeRlqflV-l0Xdw`)J3rQdpy}?UkVYbdmI8pOF3N#8BH-;`*~JSPO}sw*D3EEm%j83f+1sg{atE4_-xVph-#DjS!4wb2hDJi zF-}6n8BU9U(VRnqX+i#7ku(6W%4zEWCp66v4ro{?D*;Zklm<8+)J}YuB`~AL7bGP} z^SWF-6@hV;t1B%O?T!S4g#uZW4_E_@@PQ$dN96> zGhCUAvxpO7mJ%(X8X}b91^j`PEQ<+Fg(`|@3@JM@F8v~pX+RiR)FqbLLsU% zr2(pSQLT)gFrzr-%2VmDM0zRY%`;}(q6Mev0&GPL_6*8078T7@snKl3vuK;bN|#3T ziGWMJV72vbMrB0QS}RcFrWX&|j1Nb z{AHG?oi-X?U$lK`NrFcx*htuuQ!P6lRPibnmAXa|!Fr-m_dJp|O>406`Y@29whs z3o9u$4`(FK4#i-S~cdV+$u)yv~;-Jw{je!#jKQemtG^ZQ<9+s3ThO3HW!2 ztL8kzg*c*ep`sNI^uUCe<)$?Q=h|s2c15_dM4oYAt-M@L&4_MrgofddwjfYzth&`%&*Dx{`5^GsYU$78#S#yh45P7Zltq6RH zGj*aN25pa1jtO-{7pr+fIV`N!#3a5#2w?@L-63cmgoz=44?Rnu)?r z3brBGiN71@SI@~}-#w|<4ZYiOcNgtCVRtL*ZbV(3a^=(-kgLlS<4U2)cpC}V3Ah&D z+ITBHIO)~_w^H3fo?Fn?B3o5hCfIJ}S_^BX37cx2P%CvSrnQi^J--yos&h;ySsi4X z)}c6dPl`2q(i`@@8(!6ptXlRh38%+1`xq(Mt;Zwrs1`IVjq=e87{J9H19rWpdPZxV`MNb!bcEQulojb78 zq|VNUb*xLf9D)yZ*6d)WMwu358jNWmrb(CvV0QS@ftK6J(f~^bQ+ANjrpcW!Y3%+E zx#%p{8;U)c6RF 
z4W+pdPU%ljI6XP(opx{gr{H0A>z?-KH6@fZOkgJ1VTy!3&l3c^(&8V(sQ z;KX)4EVP+uc^CyO;@|hlLPgby$%^JTsXy4MDUQEE*}ScuDu1kcS>mXms$d1s12UV7 z-L1n~@~dmE_QD9X+t3};o@J;ez$m?L;&$40()$;xE0qn2{EoE?llI292h|+3vtwE* zJz+eDXN*fKz9A02d*b5Y@Zhl5?|nRt4iEal>HMHShhJx(j*budhrN^V^yFy%>Fl`r zITPN>@_&hS)zpI> F003c(c;)~A literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/.part-0-034376f0-4c6b-4bf0-8912-4035d651b982.crc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/.part-0-034376f0-4c6b-4bf0-8912-4035d651b982.crc new file mode 100644 index 0000000000000000000000000000000000000000..452d210694c39f4ccd05ec80f57d9f0c9464151a GIT binary patch literal 16 XcmYc;N@ieSU}BgyKUUu-^pp<(BAo=Y literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/part-0-034376f0-4c6b-4bf0-8912-4035d651b982 b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/part-0-034376f0-4c6b-4bf0-8912-4035d651b982 new file mode 100644 index 0000000000000000000000000000000000000000..de784fadf70f331163ac2c213fd9b628fe635c38 GIT binary patch literal 928 zcmV;R17G}$0{{T93IG5owJ-f(unFxO0H&yxOVHA)10c2yfW8r;q5J_#KF*vw8u246 zIV(Ca6wRJfKmf7Q*5Np`^w1J@K-mi4{LjQkHnx;oN-3qJCp+OrzNQ%uIl6L&py@4B zd;oL+jQ}Ryv$*Z;8LQmNS>{*FkzCZf+Kf|d(8@muWrcj|DT@_t-kM`uE3T%-D)G_L zRE>W+o9H92Xm93Yt~RzBZNROZ^<1r(Bel3~-FDZTtwwWYyhm>9X{*sncy7QL&kTs7 zG%*{9$^WM`5e7?dbVfhyB$Vv0LC*9vqycoqIcIzmJ`MlrXT5LL=1hc=C__D_A3b6Z zauh65y1`~Jdu_)ZW>*|rT=l)Ht#2PkZg=J5t{h*??RJfrvaTbqc4x~iC#k*OvEos6 zvLenq^Q^Dkc9$qOEIx~y=c_HQrlw+r*wJDUTYNLVR_|S}$tt@a-hr2d2R~iUug9R} z6(P3+>Ug^I<@@z=3v_5;o>Fe(C^XPwS>E&^52Qai>F1CN8ROf%TpmROsGF!VF8?yN z3oyzUGZ0D^9Un>%F*~G%R?jT@myIW0$N&HZ>HmLM;9!Luagp>dso;(-gu=r=SzslA zyg?A+QARjDg26z)|&s%qG`Yp%7SL(SX%L?6Su)pAah=LFBMmjB1tvY zsL(=oNZ|7#sh|&@Goi%)I3tVc9FSZe07q$Q+#sWc{(i$~1OcIN$%LJ-yQ;3%B~hKe zMO(O)VwR^)vC-Cy%Ph~T%^Wax=71<_qbBaeU0jJK%3!HU&!yGx=yrBLy`S#|?D1Al 
zUob8@9bkZosIe_FG$SKHlq5+C>F`3Kq3IF;;#dY_8IoZU2oW(tVw9xfhF@d0#x!c^M( zV*s(iL|gkKOl~ln3}!Y_2-8)D*TM>tO8(9hk@oYS@COY500000001bpFa00@0RRBl Cu)*>G literal 0 HcmV?d00001 From e5880c90ac83839f7787dc8090f15c507a0288f9 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 10 Apr 2025 12:02:29 -0400 Subject: [PATCH 09/55] finish tests --- v03_pipeline/lib/annotations/misc.py | 27 +- v03_pipeline/lib/annotations/misc_test.py | 454 +++++++++++++++++++++- 2 files changed, 468 insertions(+), 13 deletions(-) diff --git a/v03_pipeline/lib/annotations/misc.py b/v03_pipeline/lib/annotations/misc.py index a4d685daf..3e989a1a5 100644 --- a/v03_pipeline/lib/annotations/misc.py +++ b/v03_pipeline/lib/annotations/misc.py @@ -23,14 +23,15 @@ def unmap_reference_dataset_annotation_enums( reference_genome: ReferenceGenome, dataset_type: DatasetType, ) -> hl.Table: - formatting_annotation_names = { - fa.__name__ for fa in dataset_type.formatting_annotation_fns(reference_genome) - } - removed_enum_names = [] + reference_datasets = ReferenceDataset.for_reference_genome_dataset_type_annotations( + reference_genome, + dataset_type, + ) + unmapped_annotation_name = [] for annotation_name in ht.enums: - if annotation_name in formatting_annotation_names: + if annotation_name not in reference_datasets: continue - for enum_name, enum_values in ht.enums[annotation_name].values(): + for enum_name in ht.enums[annotation_name]: if hasattr(ht[annotation_name], f'{enum_name}_ids'): ht = ht.annotate( **{ @@ -38,7 +39,11 @@ def unmap_reference_dataset_annotation_enums( **{ f'{enum_name}s': ht[annotation_name][ f'{enum_name}_ids' - ].map(lambda idx: enum_values[idx]), # noqa: B023 + ].map( + lambda idx: ht.enums[annotation_name][enum_name][ + idx + ] + ), # noqa: B023 }, ), }, @@ -51,7 +56,7 @@ def unmap_reference_dataset_annotation_enums( **{ annotation_name: ht[annotation_name].annotate( **{ - enum_name: enum_values[ + enum_name: ht.enums[annotation_name][enum_name][ 
ht[annotation_name][f'{enum_name}_id'] ], }, @@ -61,7 +66,7 @@ def unmap_reference_dataset_annotation_enums( ht = ht.annotate( **{annotation_name: ht[annotation_name].drop(f'{enum_name}_id')}, ) - removed_enum_names.add(enum_name) + unmapped_annotation_name.append(annotation_name) # Explicit clinvar edge case: if hasattr(ht, ReferenceDataset.clinvar.value): @@ -82,7 +87,7 @@ def unmap_reference_dataset_annotation_enums( ), }, ) - return ht.annotate_globals(enums=ht.globals.enums.drop(*removed_enum_names)) + return ht.annotate_globals(enums=ht.globals.enums.drop(*unmapped_annotation_name)) def unmap_formatting_annotation_enums( @@ -150,7 +155,7 @@ def unmap_formatting_annotation_enums( ).drop( 'biotype_id', 'consequence_term_ids', - **( + *( [] if reference_genome == ReferenceGenome.GRCh38 and dataset_type == DatasetType.SNV_INDEL diff --git a/v03_pipeline/lib/annotations/misc_test.py b/v03_pipeline/lib/annotations/misc_test.py index bd1afd143..f877fd01a 100644 --- a/v03_pipeline/lib/annotations/misc_test.py +++ b/v03_pipeline/lib/annotations/misc_test.py @@ -4,6 +4,7 @@ from v03_pipeline.lib.annotations.misc import ( unmap_formatting_annotation_enums, + unmap_reference_dataset_annotation_enums, ) from v03_pipeline.lib.model import ( DatasetType, @@ -16,8 +17,457 @@ class MiscTest(unittest.TestCase): - def unmap_formatting_annotation_enums(self) -> None: + def test_unmap_formatting_annotation_enums(self) -> None: ht = hl.read_table(TEST_SNV_INDEL_ANNOTATIONS) ht = unmap_formatting_annotation_enums( - ht, ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, + ht, + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + ) + self.assertListEqual( + list(ht.globals.enums.collect()[0].keys()), + [ + 'screen', + 'dbnsfp', + 'clinvar', + 'gnomad_exomes', + 'gnomad_non_coding_constraint', + 'splice_ai', + 'exac', + 'topmed', + 'hgmd', + 'gnomad_genomes', + 'eigen', + ], + ) + self.assertEqual( + ht.collect()[0], + hl.Struct( + locus=hl.Locus( + contig='chr1', position=939121, 
reference_genome='GRCh38' + ), + alleles=['C', 'T'], + rg37_locus=hl.Locus( + contig=1, position=874501, reference_genome='GRCh37' + ), + rsid=None, + sorted_transcript_consequences=[ + hl.Struct( + amino_acids='S/L', + canonical=1, + codons='tCg/tTg', + gene_id='ENSG00000187634', + hgvsc='ENST00000616016.5:c.1049C>T', + hgvsp='ENSP00000478421.2:p.Ser350Leu', + transcript_id='ENST00000616016', + mane_select='NM_001385641.1', + mane_plus_clinical=None, + exon=hl.Struct(index=6, total=14), + intron=None, + refseq_transcript_id='NM_001385641.1', + alphamissense=hl.Struct(pathogenicity=None), + loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), + spliceregion=hl.Struct( + extended_intronic_splice_region_variant=False + ), + utrannotator=hl.Struct( + existing_inframe_oorfs=None, + existing_outofframe_oorfs=None, + existing_uorfs=None, + fiveutr_annotation=None, + fiveutr_consequence=None, + ), + biotype='protein_coding', + consequence_terms=['missense_variant'], + ), + hl.Struct( + amino_acids='S/L', + canonical=None, + codons='tCg/tTg', + gene_id='ENSG00000187634', + hgvsc='ENST00000341065.8:c.284C>T', + hgvsp='ENSP00000349216.4:p.Ser95Leu', + transcript_id='ENST00000341065', + mane_select=None, + mane_plus_clinical=None, + exon=hl.Struct(index=4, total=12), + intron=None, + refseq_transcript_id=None, + alphamissense=hl.Struct(pathogenicity=None), + loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), + spliceregion=hl.Struct( + extended_intronic_splice_region_variant=False + ), + utrannotator=hl.Struct( + existing_inframe_oorfs=None, + existing_outofframe_oorfs=None, + existing_uorfs=None, + fiveutr_annotation=None, + fiveutr_consequence=None, + ), + biotype='protein_coding', + consequence_terms=['missense_variant'], + ), + hl.Struct( + amino_acids='S/L', + canonical=None, + codons='tCg/tTg', + gene_id='ENSG00000187634', + hgvsc='ENST00000342066.8:c.512C>T', + hgvsp='ENSP00000342313.3:p.Ser171Leu', + transcript_id='ENST00000342066', + mane_select=None, + 
mane_plus_clinical=None, + exon=hl.Struct(index=6, total=14), + intron=None, + refseq_transcript_id='NM_152486.4', + alphamissense=hl.Struct(pathogenicity=0.1467999964952469), + loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), + spliceregion=hl.Struct( + extended_intronic_splice_region_variant=False + ), + utrannotator=hl.Struct( + existing_inframe_oorfs=None, + existing_outofframe_oorfs=None, + existing_uorfs=None, + fiveutr_annotation=None, + fiveutr_consequence=None, + ), + biotype='protein_coding', + consequence_terms=['missense_variant'], + ), + hl.Struct( + amino_acids='S/L', + canonical=None, + codons='tCg/tTg', + gene_id='ENSG00000187634', + hgvsc='ENST00000616125.5:c.512C>T', + hgvsp='ENSP00000484643.1:p.Ser171Leu', + transcript_id='ENST00000616125', + mane_select=None, + mane_plus_clinical=None, + exon=hl.Struct(index=5, total=11), + intron=None, + refseq_transcript_id=None, + alphamissense=hl.Struct(pathogenicity=0.1467999964952469), + loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), + spliceregion=hl.Struct( + extended_intronic_splice_region_variant=False + ), + utrannotator=hl.Struct( + existing_inframe_oorfs=None, + existing_outofframe_oorfs=None, + existing_uorfs=None, + fiveutr_annotation=None, + fiveutr_consequence=None, + ), + biotype='protein_coding', + consequence_terms=['missense_variant'], + ), + hl.Struct( + amino_acids='S/L', + canonical=None, + codons='tCg/tTg', + gene_id='ENSG00000187634', + hgvsc='ENST00000617307.5:c.512C>T', + hgvsp='ENSP00000482090.2:p.Ser171Leu', + transcript_id='ENST00000617307', + mane_select=None, + mane_plus_clinical=None, + exon=hl.Struct(index=5, total=13), + intron=None, + refseq_transcript_id=None, + alphamissense=hl.Struct(pathogenicity=0.1467999964952469), + loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), + spliceregion=hl.Struct( + extended_intronic_splice_region_variant=False + ), + utrannotator=hl.Struct( + existing_inframe_oorfs=None, + existing_outofframe_oorfs=None, + 
existing_uorfs=None, + fiveutr_annotation=None, + fiveutr_consequence=None, + ), + biotype='protein_coding', + consequence_terms=['missense_variant'], + ), + hl.Struct( + amino_acids='S/L', + canonical=None, + codons='tCg/tTg', + gene_id='ENSG00000187634', + hgvsc='ENST00000618181.5:c.461C>T', + hgvsp='ENSP00000480870.1:p.Ser154Leu', + transcript_id='ENST00000618181', + mane_select=None, + mane_plus_clinical=None, + exon=hl.Struct(index=4, total=10), + intron=None, + refseq_transcript_id=None, + alphamissense=hl.Struct(pathogenicity=None), + loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), + spliceregion=hl.Struct( + extended_intronic_splice_region_variant=False + ), + utrannotator=hl.Struct( + existing_inframe_oorfs=None, + existing_outofframe_oorfs=None, + existing_uorfs=None, + fiveutr_annotation=None, + fiveutr_consequence=None, + ), + biotype='protein_coding', + consequence_terms=['missense_variant'], + ), + hl.Struct( + amino_acids='S/L', + canonical=None, + codons='tCg/tTg', + gene_id='ENSG00000187634', + hgvsc='ENST00000618323.5:c.1049C>T', + hgvsp='ENSP00000480678.2:p.Ser350Leu', + transcript_id='ENST00000618323', + mane_select=None, + mane_plus_clinical=None, + exon=hl.Struct(index=6, total=14), + intron=None, + refseq_transcript_id='NM_001385640.1', + alphamissense=hl.Struct(pathogenicity=None), + loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), + spliceregion=hl.Struct( + extended_intronic_splice_region_variant=False + ), + utrannotator=hl.Struct( + existing_inframe_oorfs=None, + existing_outofframe_oorfs=None, + existing_uorfs=None, + fiveutr_annotation=None, + fiveutr_consequence=None, + ), + biotype='protein_coding', + consequence_terms=['missense_variant'], + ), + hl.Struct( + amino_acids='S/L', + canonical=None, + codons='tCg/tTg', + gene_id='ENSG00000187634', + hgvsc='ENST00000618779.5:c.512C>T', + hgvsp='ENSP00000484256.1:p.Ser171Leu', + transcript_id='ENST00000618779', + mane_select=None, + mane_plus_clinical=None, + 
exon=hl.Struct(index=5, total=12), + intron=None, + refseq_transcript_id=None, + alphamissense=hl.Struct(pathogenicity=0.1467999964952469), + loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), + spliceregion=hl.Struct( + extended_intronic_splice_region_variant=False + ), + utrannotator=hl.Struct( + existing_inframe_oorfs=None, + existing_outofframe_oorfs=None, + existing_uorfs=None, + fiveutr_annotation=None, + fiveutr_consequence=None, + ), + biotype='protein_coding', + consequence_terms=['missense_variant'], + ), + hl.Struct( + amino_acids='S/L', + canonical=None, + codons='tCg/tTg', + gene_id='ENSG00000187634', + hgvsc='ENST00000622503.5:c.512C>T', + hgvsp='ENSP00000482138.1:p.Ser171Leu', + transcript_id='ENST00000622503', + mane_select=None, + mane_plus_clinical=None, + exon=hl.Struct(index=5, total=13), + intron=None, + refseq_transcript_id=None, + alphamissense=hl.Struct(pathogenicity=0.1467999964952469), + loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), + spliceregion=hl.Struct( + extended_intronic_splice_region_variant=False + ), + utrannotator=hl.Struct( + existing_inframe_oorfs=None, + existing_outofframe_oorfs=None, + existing_uorfs=None, + fiveutr_annotation=None, + fiveutr_consequence=None, + ), + biotype='protein_coding', + consequence_terms=['missense_variant'], + ), + ], + variant_id='1-939121-C-T', + xpos=1000939121, + gt_stats=hl.Struct(AC=47, AN=81784, AF=0.0005746845272369683, hom=1), + CAID='CA502654', + check_ref=False, + sorted_regulatory_feature_consequences=[ + hl.Struct( + regulatory_feature_id='ENSR00000344437', + biotype='CTCF_binding_site', + consequence_terms=['regulatory_region_variant'], + ) + ], + sorted_motif_feature_consequences=[ + hl.Struct( + motif_feature_id='ENSM00493959715', + consequence_terms=['TF_binding_site_variant'], + ) + ], + gnomad_non_coding_constraint=hl.Struct(z_score=None), + hgmd=None, + gnomad_exomes=hl.Struct( + AF=0.0006690866430290043, + AN=1440770, + AC=964, + Hom=0, + 
AF_POPMAX_OR_GLOBAL=0.0008023773552849889, + FAF_AF=0.000633420015219599, + Hemi=0, + ), + gnomad_genomes=hl.Struct( + AF=0.0002759889466688037, + AN=152180, + AC=42, + Hom=0, + AF_POPMAX_OR_GLOBAL=0.0005293028079904616, + FAF_AF=0.0002092500071739778, + Hemi=0, + ), + screen=hl.Struct(region_type_ids=[]), + dbnsfp=hl.Struct( + PrimateAI_score=0.5918066501617432, + fathmm_MKL_coding_score=0.7174800038337708, + CADD_phred=23.5, + SIFT_score=0.0010000000474974513, + REVEL_score=0.3109999895095825, + Polyphen2_HVAR_score=0.164000004529953, + VEST4_score=0.39500001072883606, + MPC_score=0.01291007362306118, + MutPred_score=None, + MutationTaster_pred_id=0, + ), + topmed=hl.Struct( + AC=41, AF=0.00032651599030941725, AN=125568, Hom=0, Het=41 + ), + exac=hl.Struct( + AF_POPMAX=0.0007150234305299819, + AF=0.00019039999460801482, + AC_Adj=20, + AC_Het=20, + AC_Hom=0, + AC_Hemi=None, + AN_Adj=47974, + ), + splice_ai=hl.Struct(delta_score=0.0, splice_consequence_id=4), + eigen=hl.Struct(Eigen_phred=2.628000020980835), + clinvar=hl.Struct( + alleleId=929885, + conflictingPathogenicities=None, + goldStars=1, + submitters=['Labcorp Genetics (formerly Invitae), Labcorp'], + conditions=['not provided'], + assertion_ids=[], + pathogenicity_id=12, + ), + ), + ) + + def test_unmap_reference_dataset_annotation_enums(self) -> None: + ht = hl.read_table(TEST_SNV_INDEL_ANNOTATIONS) + ht = unmap_reference_dataset_annotation_enums( + ht, + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + ) + self.assertListEqual( + list(ht.globals.enums.collect()[0].keys()), + [ + 'sorted_motif_feature_consequences', + 'sorted_regulatory_feature_consequences', + 'sorted_transcript_consequences', + ], + ) + self.assertEqual( + ht.drop( + 'sorted_transcript_consequences', + 'sorted_regulatory_feature_consequences', + 'sorted_motif_feature_consequences', + ).collect()[0], + hl.Struct( + locus=hl.Locus(contig='chr1', position=939121, reference_genome='GRCh38'), + alleles=['C', 'T'], + 
rg37_locus=hl.Locus(contig=1, position=874501, reference_genome='GRCh37'), + rsid=None, + variant_id='1-939121-C-T', + xpos=1000939121, + gt_stats=hl.Struct(AC=47, AN=81784, AF=0.0005746845272369683, hom=1), + CAID='CA502654', + check_ref=False, + gnomad_non_coding_constraint=hl.Struct(z_score=None), + hgmd=None, + gnomad_exomes=hl.Struct( + AF=0.0006690866430290043, + AN=1440770, + AC=964, + Hom=0, + AF_POPMAX_OR_GLOBAL=0.0008023773552849889, + FAF_AF=0.000633420015219599, + Hemi=0, + ), + gnomad_genomes=hl.Struct( + AF=0.0002759889466688037, + AN=152180, + AC=42, + Hom=0, + AF_POPMAX_OR_GLOBAL=0.0005293028079904616, + FAF_AF=0.0002092500071739778, + Hemi=0, + ), + screen=hl.Struct(region_types=[]), + dbnsfp=hl.Struct( + PrimateAI_score=0.5918066501617432, + fathmm_MKL_coding_score=0.7174800038337708, + CADD_phred=23.5, + SIFT_score=0.0010000000474974513, + REVEL_score=0.3109999895095825, + Polyphen2_HVAR_score=0.164000004529953, + VEST4_score=0.39500001072883606, + MPC_score=0.01291007362306118, + MutPred_score=None, + MutationTaster_pred='D', + ), + topmed=hl.Struct( + AC=41, AF=0.00032651599030941725, AN=125568, Hom=0, Het=41 + ), + exac=hl.Struct( + AF_POPMAX=0.0007150234305299819, + AF=0.00019039999460801482, + AC_Adj=20, + AC_Het=20, + AC_Hom=0, + AC_Hemi=None, + AN_Adj=47974, + ), + splice_ai=hl.Struct(delta_score=0.0, splice_consequence='No consequence'), + eigen=hl.Struct(Eigen_phred=2.628000020980835), + clinvar=hl.Struct( + alleleId=929885, + conflictingPathogenicities=None, + goldStars=1, + submitters=['Labcorp Genetics (formerly Invitae), Labcorp'], + conditions=['not provided'], + assertions=[], + pathogenicity='Uncertain_significance', + ), + ), ) From 2c60e100ce392b1dd6fc65772b4c5d1fe6f3242e Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 10 Apr 2025 12:15:26 -0400 Subject: [PATCH 10/55] ruff --- v03_pipeline/lib/annotations/misc.py | 6 +-- v03_pipeline/lib/annotations/misc_test.py | 59 ++++++++++++++++------- 2 files changed, 44 
insertions(+), 21 deletions(-) diff --git a/v03_pipeline/lib/annotations/misc.py b/v03_pipeline/lib/annotations/misc.py index 3e989a1a5..c4ccfd7d4 100644 --- a/v03_pipeline/lib/annotations/misc.py +++ b/v03_pipeline/lib/annotations/misc.py @@ -40,10 +40,10 @@ def unmap_reference_dataset_annotation_enums( f'{enum_name}s': ht[annotation_name][ f'{enum_name}_ids' ].map( - lambda idx: ht.enums[annotation_name][enum_name][ + lambda idx: ht.enums[annotation_name][enum_name][ # noqa: B023 idx - ] - ), # noqa: B023 + ], + ), }, ), }, diff --git a/v03_pipeline/lib/annotations/misc_test.py b/v03_pipeline/lib/annotations/misc_test.py index f877fd01a..0e57d0894 100644 --- a/v03_pipeline/lib/annotations/misc_test.py +++ b/v03_pipeline/lib/annotations/misc_test.py @@ -44,11 +44,15 @@ def test_unmap_formatting_annotation_enums(self) -> None: ht.collect()[0], hl.Struct( locus=hl.Locus( - contig='chr1', position=939121, reference_genome='GRCh38' + contig='chr1', + position=939121, + reference_genome='GRCh38', ), alleles=['C', 'T'], rg37_locus=hl.Locus( - contig=1, position=874501, reference_genome='GRCh37' + contig=1, + position=874501, + reference_genome='GRCh37', ), rsid=None, sorted_transcript_consequences=[ @@ -68,7 +72,7 @@ def test_unmap_formatting_annotation_enums(self) -> None: alphamissense=hl.Struct(pathogenicity=None), loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False + extended_intronic_splice_region_variant=False, ), utrannotator=hl.Struct( existing_inframe_oorfs=None, @@ -96,7 +100,7 @@ def test_unmap_formatting_annotation_enums(self) -> None: alphamissense=hl.Struct(pathogenicity=None), loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False + extended_intronic_splice_region_variant=False, ), utrannotator=hl.Struct( existing_inframe_oorfs=None, @@ -124,7 +128,7 @@ def test_unmap_formatting_annotation_enums(self) -> 
None: alphamissense=hl.Struct(pathogenicity=0.1467999964952469), loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False + extended_intronic_splice_region_variant=False, ), utrannotator=hl.Struct( existing_inframe_oorfs=None, @@ -152,7 +156,7 @@ def test_unmap_formatting_annotation_enums(self) -> None: alphamissense=hl.Struct(pathogenicity=0.1467999964952469), loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False + extended_intronic_splice_region_variant=False, ), utrannotator=hl.Struct( existing_inframe_oorfs=None, @@ -180,7 +184,7 @@ def test_unmap_formatting_annotation_enums(self) -> None: alphamissense=hl.Struct(pathogenicity=0.1467999964952469), loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False + extended_intronic_splice_region_variant=False, ), utrannotator=hl.Struct( existing_inframe_oorfs=None, @@ -208,7 +212,7 @@ def test_unmap_formatting_annotation_enums(self) -> None: alphamissense=hl.Struct(pathogenicity=None), loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False + extended_intronic_splice_region_variant=False, ), utrannotator=hl.Struct( existing_inframe_oorfs=None, @@ -236,7 +240,7 @@ def test_unmap_formatting_annotation_enums(self) -> None: alphamissense=hl.Struct(pathogenicity=None), loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False + extended_intronic_splice_region_variant=False, ), utrannotator=hl.Struct( existing_inframe_oorfs=None, @@ -264,7 +268,7 @@ def test_unmap_formatting_annotation_enums(self) -> None: alphamissense=hl.Struct(pathogenicity=0.1467999964952469), loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), spliceregion=hl.Struct( - 
extended_intronic_splice_region_variant=False + extended_intronic_splice_region_variant=False, ), utrannotator=hl.Struct( existing_inframe_oorfs=None, @@ -292,7 +296,7 @@ def test_unmap_formatting_annotation_enums(self) -> None: alphamissense=hl.Struct(pathogenicity=0.1467999964952469), loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False + extended_intronic_splice_region_variant=False, ), utrannotator=hl.Struct( existing_inframe_oorfs=None, @@ -315,13 +319,13 @@ def test_unmap_formatting_annotation_enums(self) -> None: regulatory_feature_id='ENSR00000344437', biotype='CTCF_binding_site', consequence_terms=['regulatory_region_variant'], - ) + ), ], sorted_motif_feature_consequences=[ hl.Struct( motif_feature_id='ENSM00493959715', consequence_terms=['TF_binding_site_variant'], - ) + ), ], gnomad_non_coding_constraint=hl.Struct(z_score=None), hgmd=None, @@ -357,7 +361,11 @@ def test_unmap_formatting_annotation_enums(self) -> None: MutationTaster_pred_id=0, ), topmed=hl.Struct( - AC=41, AF=0.00032651599030941725, AN=125568, Hom=0, Het=41 + AC=41, + AF=0.00032651599030941725, + AN=125568, + Hom=0, + Het=41, ), exac=hl.Struct( AF_POPMAX=0.0007150234305299819, @@ -404,9 +412,17 @@ def test_unmap_reference_dataset_annotation_enums(self) -> None: 'sorted_motif_feature_consequences', ).collect()[0], hl.Struct( - locus=hl.Locus(contig='chr1', position=939121, reference_genome='GRCh38'), + locus=hl.Locus( + contig='chr1', + position=939121, + reference_genome='GRCh38', + ), alleles=['C', 'T'], - rg37_locus=hl.Locus(contig=1, position=874501, reference_genome='GRCh37'), + rg37_locus=hl.Locus( + contig=1, + position=874501, + reference_genome='GRCh37', + ), rsid=None, variant_id='1-939121-C-T', xpos=1000939121, @@ -447,7 +463,11 @@ def test_unmap_reference_dataset_annotation_enums(self) -> None: MutationTaster_pred='D', ), topmed=hl.Struct( - AC=41, AF=0.00032651599030941725, AN=125568, Hom=0, Het=41 + 
AC=41, + AF=0.00032651599030941725, + AN=125568, + Hom=0, + Het=41, ), exac=hl.Struct( AF_POPMAX=0.0007150234305299819, @@ -458,7 +478,10 @@ def test_unmap_reference_dataset_annotation_enums(self) -> None: AC_Hemi=None, AN_Adj=47974, ), - splice_ai=hl.Struct(delta_score=0.0, splice_consequence='No consequence'), + splice_ai=hl.Struct( + delta_score=0.0, + splice_consequence='No consequence', + ), eigen=hl.Struct(Eigen_phred=2.628000020980835), clinvar=hl.Struct( alleleId=929885, From 21e2d953cb362d87d91244678d11f74f4a85954a Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 17 Apr 2025 19:22:15 -0400 Subject: [PATCH 11/55] Add camelcase --- v03_pipeline/lib/tasks/exports/misc.py | 46 +++++++++++++++++++++ v03_pipeline/lib/tasks/exports/misc_test.py | 29 +++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 v03_pipeline/lib/tasks/exports/misc.py create mode 100644 v03_pipeline/lib/tasks/exports/misc_test.py diff --git a/v03_pipeline/lib/tasks/exports/misc.py b/v03_pipeline/lib/tasks/exports/misc.py new file mode 100644 index 000000000..c5410345d --- /dev/null +++ b/v03_pipeline/lib/tasks/exports/misc.py @@ -0,0 +1,46 @@ +import hail as hl + +from v03_pipeline.lib.model.definitions import ReferenceGenome + + +def snake_to_camelcase(snake_string: str): + components = snake_string.split('_') + return components[0] + ''.join(x.title() for x in components[1:]) + + +def camelcase_hl_struct(s: hl.StructExpression) -> hl.StructExpression: + return s.rename({f: snake_to_camelcase(f) for f in s.keys()}) + + +def camelcase_array_structexpression_fields( + ht: hl.Table, + reference_genome: ReferenceGenome, +): + for field in ht.row: + if not isinstance( + ht[field], + hl.expr.expressions.typed_expressions.ArrayStructExpression, + ): + continue + ht = ht.transmute( + **{ + snake_to_camelcase(field): ht[field].map( + lambda c: camelcase_hl_struct(c), + ), + }, + ) + + # Custom handling of nested sorted_transcript_consequences fields for GRCh38 + if ( + 
reference_genome == ReferenceGenome.GRCh38 + and 'sortedTranscriptConsequences' in ht.row + ): + ht = ht.annotate( + sortedTranscriptConsequences=ht.sortedTranscriptConsequences.map( + lambda s: s.annotate( + loftee=camelcase_hl_struct(s.loftee), + utrannotator=camelcase_hl_struct(s.utrannotator), + ), + ), + ) + return ht diff --git a/v03_pipeline/lib/tasks/exports/misc_test.py b/v03_pipeline/lib/tasks/exports/misc_test.py new file mode 100644 index 000000000..17fce9fbd --- /dev/null +++ b/v03_pipeline/lib/tasks/exports/misc_test.py @@ -0,0 +1,29 @@ +import unittest + +import hail as hl + +from v03_pipeline.lib.annotations.misc import ( + unmap_formatting_annotation_enums, + unmap_reference_dataset_annotation_enums, +) +from v03_pipeline.lib.model import ( + DatasetType, + ReferenceGenome, +) +from v03_pipeline.lib.tasks.exports.misc import camelcase_array_structexpression_fields + +TEST_SNV_INDEL_ANNOTATIONS = ( + 'v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht' +) + + +class MiscTest(unittest.TestCase): + def test_camelcase_array_structexpression_fields(self) -> None: + ht = hl.read_table(TEST_SNV_INDEL_ANNOTATIONS) + ht = unmap_formatting_annotation_enums( + ht, ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, + ) + ht = unmap_reference_dataset_annotation_enums( + ht, ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, + ) + ht = camelcase_array_structexpression_fields(ht, ReferenceGenome.GRCh38) From 2a052417c30f73da83a32142ec90d4699f46458b Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 17 Apr 2025 22:02:26 -0400 Subject: [PATCH 12/55] add camelcase --- v03_pipeline/lib/tasks/exports/misc_test.py | 353 +++++++++++++++++++- 1 file changed, 351 insertions(+), 2 deletions(-) diff --git a/v03_pipeline/lib/tasks/exports/misc_test.py b/v03_pipeline/lib/tasks/exports/misc_test.py index 17fce9fbd..b3ad958b4 100644 --- a/v03_pipeline/lib/tasks/exports/misc_test.py +++ b/v03_pipeline/lib/tasks/exports/misc_test.py @@ -21,9 +21,358 @@ class 
MiscTest(unittest.TestCase): def test_camelcase_array_structexpression_fields(self) -> None: ht = hl.read_table(TEST_SNV_INDEL_ANNOTATIONS) ht = unmap_formatting_annotation_enums( - ht, ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, + ht, + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, ) ht = unmap_reference_dataset_annotation_enums( - ht, ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, + ht, + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, ) ht = camelcase_array_structexpression_fields(ht, ReferenceGenome.GRCh38) + self.assertEqual( + ht.collect()[0], + hl.Struct( + locus=hl.Locus( + contig='chr1', position=939121, reference_genome='GRCh38' + ), + alleles=['C', 'T'], + rg37_locus=hl.Locus(contig=1, position=874501, reference_genome='GRCh37'), + rsid=None, + variant_id='1-939121-C-T', + xpos=1000939121, + gt_stats=hl.Struct(AC=47, AN=81784, AF=0.0005746845272369683, hom=1), + CAID='CA502654', + check_ref=False, + gnomad_non_coding_constraint=hl.Struct(z_score=None), + hgmd=None, + gnomad_exomes=hl.Struct( + AF=0.0006690866430290043, + AN=1440770, + AC=964, + Hom=0, + AF_POPMAX_OR_GLOBAL=0.0008023773552849889, + FAF_AF=0.000633420015219599, + Hemi=0, + ), + gnomad_genomes=hl.Struct( + AF=0.0002759889466688037, + AN=152180, + AC=42, + Hom=0, + AF_POPMAX_OR_GLOBAL=0.0005293028079904616, + FAF_AF=0.0002092500071739778, + Hemi=0, + ), + screen=hl.Struct(region_types=[]), + dbnsfp=hl.Struct( + PrimateAI_score=0.5918066501617432, + fathmm_MKL_coding_score=0.7174800038337708, + CADD_phred=23.5, + SIFT_score=0.0010000000474974513, + REVEL_score=0.3109999895095825, + Polyphen2_HVAR_score=0.164000004529953, + VEST4_score=0.39500001072883606, + MPC_score=0.01291007362306118, + MutPred_score=None, + MutationTaster_pred='D', + ), + topmed=hl.Struct( + AC=41, + AF=0.00032651599030941725, + AN=125568, + Hom=0, + Het=41, + ), + exac=hl.Struct( + AF_POPMAX=0.0007150234305299819, + AF=0.00019039999460801482, + AC_Adj=20, + AC_Het=20, + AC_Hom=0, + AC_Hemi=None, + AN_Adj=47974, + 
), + splice_ai=hl.Struct( + delta_score=0.0, splice_consequence='No consequence' + ), + eigen=hl.Struct(Eigen_phred=2.628000020980835), + clinvar=hl.Struct( + alleleId=929885, + conflictingPathogenicities=None, + goldStars=1, + submitters=['Labcorp Genetics (formerly Invitae), Labcorp'], + conditions=['not provided'], + assertions=[], + pathogenicity='Uncertain_significance', + ), + sortedTranscriptConsequences=[ + hl.Struct( + aminoAcids='S/L', + canonical=1, + codons='tCg/tTg', + geneId='ENSG00000187634', + hgvsc='ENST00000616016.5:c.1049C>T', + hgvsp='ENSP00000478421.2:p.Ser350Leu', + transcriptId='ENST00000616016', + maneSelect='NM_001385641.1', + manePlusClinical=None, + exon=hl.Struct(index=6, total=14), + intron=None, + refseqTranscriptId='NM_001385641.1', + alphamissense=hl.Struct(pathogenicity=None), + loftee=hl.Struct(isLofNagnag=None, lofFilters=None), + spliceregion=hl.Struct( + extended_intronic_splice_region_variant=False, + ), + utrannotator=hl.Struct( + existingInframeOorfs=None, + existingOutofframeOorfs=None, + existingUorfs=None, + fiveutrAnnotation=None, + fiveutrConsequence=None, + ), + biotype='protein_coding', + consequenceTerms=['missense_variant'], + ), + hl.Struct( + aminoAcids='S/L', + canonical=None, + codons='tCg/tTg', + geneId='ENSG00000187634', + hgvsc='ENST00000341065.8:c.284C>T', + hgvsp='ENSP00000349216.4:p.Ser95Leu', + transcriptId='ENST00000341065', + maneSelect=None, + manePlusClinical=None, + exon=hl.Struct(index=4, total=12), + intron=None, + refseqTranscriptId=None, + alphamissense=hl.Struct(pathogenicity=None), + loftee=hl.Struct(isLofNagnag=None, lofFilters=None), + spliceregion=hl.Struct( + extended_intronic_splice_region_variant=False, + ), + utrannotator=hl.Struct( + existingInframeOorfs=None, + existingOutofframeOorfs=None, + existingUorfs=None, + fiveutrAnnotation=None, + fiveutrConsequence=None, + ), + biotype='protein_coding', + consequenceTerms=['missense_variant'], + ), + hl.Struct( + aminoAcids='S/L', + 
canonical=None, + codons='tCg/tTg', + geneId='ENSG00000187634', + hgvsc='ENST00000342066.8:c.512C>T', + hgvsp='ENSP00000342313.3:p.Ser171Leu', + transcriptId='ENST00000342066', + maneSelect=None, + manePlusClinical=None, + exon=hl.Struct(index=6, total=14), + intron=None, + refseqTranscriptId='NM_152486.4', + alphamissense=hl.Struct(pathogenicity=0.1467999964952469), + loftee=hl.Struct(isLofNagnag=None, lofFilters=None), + spliceregion=hl.Struct( + extended_intronic_splice_region_variant=False, + ), + utrannotator=hl.Struct( + existingInframeOorfs=None, + existingOutofframeOorfs=None, + existingUorfs=None, + fiveutrAnnotation=None, + fiveutrConsequence=None, + ), + biotype='protein_coding', + consequenceTerms=['missense_variant'], + ), + hl.Struct( + aminoAcids='S/L', + canonical=None, + codons='tCg/tTg', + geneId='ENSG00000187634', + hgvsc='ENST00000616125.5:c.512C>T', + hgvsp='ENSP00000484643.1:p.Ser171Leu', + transcriptId='ENST00000616125', + maneSelect=None, + manePlusClinical=None, + exon=hl.Struct(index=5, total=11), + intron=None, + refseqTranscriptId=None, + alphamissense=hl.Struct(pathogenicity=0.1467999964952469), + loftee=hl.Struct(isLofNagnag=None, lofFilters=None), + spliceregion=hl.Struct( + extended_intronic_splice_region_variant=False, + ), + utrannotator=hl.Struct( + existingInframeOorfs=None, + existingOutofframeOorfs=None, + existingUorfs=None, + fiveutrAnnotation=None, + fiveutrConsequence=None, + ), + biotype='protein_coding', + consequenceTerms=['missense_variant'], + ), + hl.Struct( + aminoAcids='S/L', + canonical=None, + codons='tCg/tTg', + geneId='ENSG00000187634', + hgvsc='ENST00000617307.5:c.512C>T', + hgvsp='ENSP00000482090.2:p.Ser171Leu', + transcriptId='ENST00000617307', + maneSelect=None, + manePlusClinical=None, + exon=hl.Struct(index=5, total=13), + intron=None, + refseqTranscriptId=None, + alphamissense=hl.Struct(pathogenicity=0.1467999964952469), + loftee=hl.Struct(isLofNagnag=None, lofFilters=None), + spliceregion=hl.Struct( + 
extended_intronic_splice_region_variant=False, + ), + utrannotator=hl.Struct( + existingInframeOorfs=None, + existingOutofframeOorfs=None, + existingUorfs=None, + fiveutrAnnotation=None, + fiveutrConsequence=None, + ), + biotype='protein_coding', + consequenceTerms=['missense_variant'], + ), + hl.Struct( + aminoAcids='S/L', + canonical=None, + codons='tCg/tTg', + geneId='ENSG00000187634', + hgvsc='ENST00000618181.5:c.461C>T', + hgvsp='ENSP00000480870.1:p.Ser154Leu', + transcriptId='ENST00000618181', + maneSelect=None, + manePlusClinical=None, + exon=hl.Struct(index=4, total=10), + intron=None, + refseqTranscriptId=None, + alphamissense=hl.Struct(pathogenicity=None), + loftee=hl.Struct(isLofNagnag=None, lofFilters=None), + spliceregion=hl.Struct( + extended_intronic_splice_region_variant=False, + ), + utrannotator=hl.Struct( + existingInframeOorfs=None, + existingOutofframeOorfs=None, + existingUorfs=None, + fiveutrAnnotation=None, + fiveutrConsequence=None, + ), + biotype='protein_coding', + consequenceTerms=['missense_variant'], + ), + hl.Struct( + aminoAcids='S/L', + canonical=None, + codons='tCg/tTg', + geneId='ENSG00000187634', + hgvsc='ENST00000618323.5:c.1049C>T', + hgvsp='ENSP00000480678.2:p.Ser350Leu', + transcriptId='ENST00000618323', + maneSelect=None, + manePlusClinical=None, + exon=hl.Struct(index=6, total=14), + intron=None, + refseqTranscriptId='NM_001385640.1', + alphamissense=hl.Struct(pathogenicity=None), + loftee=hl.Struct(isLofNagnag=None, lofFilters=None), + spliceregion=hl.Struct( + extended_intronic_splice_region_variant=False, + ), + utrannotator=hl.Struct( + existingInframeOorfs=None, + existingOutofframeOorfs=None, + existingUorfs=None, + fiveutrAnnotation=None, + fiveutrConsequence=None, + ), + biotype='protein_coding', + consequenceTerms=['missense_variant'], + ), + hl.Struct( + aminoAcids='S/L', + canonical=None, + codons='tCg/tTg', + geneId='ENSG00000187634', + hgvsc='ENST00000618779.5:c.512C>T', + hgvsp='ENSP00000484256.1:p.Ser171Leu', 
+ transcriptId='ENST00000618779', + maneSelect=None, + manePlusClinical=None, + exon=hl.Struct(index=5, total=12), + intron=None, + refseqTranscriptId=None, + alphamissense=hl.Struct(pathogenicity=0.1467999964952469), + loftee=hl.Struct(isLofNagnag=None, lofFilters=None), + spliceregion=hl.Struct( + extended_intronic_splice_region_variant=False, + ), + utrannotator=hl.Struct( + existingInframeOorfs=None, + existingOutofframeOorfs=None, + existingUorfs=None, + fiveutrAnnotation=None, + fiveutrConsequence=None, + ), + biotype='protein_coding', + consequenceTerms=['missense_variant'], + ), + hl.Struct( + aminoAcids='S/L', + canonical=None, + codons='tCg/tTg', + geneId='ENSG00000187634', + hgvsc='ENST00000622503.5:c.512C>T', + hgvsp='ENSP00000482138.1:p.Ser171Leu', + transcriptId='ENST00000622503', + maneSelect=None, + manePlusClinical=None, + exon=hl.Struct(index=5, total=13), + intron=None, + refseqTranscriptId=None, + alphamissense=hl.Struct(pathogenicity=0.1467999964952469), + loftee=hl.Struct(isLofNagnag=None, lofFilters=None), + spliceregion=hl.Struct( + extended_intronic_splice_region_variant=False, + ), + utrannotator=hl.Struct( + existingInframeOorfs=None, + existingOutofframeOorfs=None, + existingUorfs=None, + fiveutrAnnotation=None, + fiveutrConsequence=None, + ), + biotype='protein_coding', + consequenceTerms=['missense_variant'], + ), + ], + sortedRegulatoryFeatureConsequences=[ + hl.Struct( + regulatoryFeatureId='ENSR00000344437', + biotype='CTCF_binding_site', + consequenceTerms=['regulatory_region_variant'], + ), + ], + sortedMotifFeatureConsequences=[ + hl.Struct( + motifFeatureId='ENSM00493959715', + consequenceTerms=['TF_binding_site_variant'], + ), + ], + ), + ) From 236d5bb15b40f19ff3484f83c2dcdfefcd3e14fc Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Fri, 18 Apr 2025 14:02:16 -0400 Subject: [PATCH 13/55] ruff --- v03_pipeline/lib/tasks/exports/misc.py | 2 +- v03_pipeline/lib/tasks/exports/misc_test.py | 13 ++++++++++--- 2 files 
changed, 11 insertions(+), 4 deletions(-) diff --git a/v03_pipeline/lib/tasks/exports/misc.py b/v03_pipeline/lib/tasks/exports/misc.py index c5410345d..483195477 100644 --- a/v03_pipeline/lib/tasks/exports/misc.py +++ b/v03_pipeline/lib/tasks/exports/misc.py @@ -9,7 +9,7 @@ def snake_to_camelcase(snake_string: str): def camelcase_hl_struct(s: hl.StructExpression) -> hl.StructExpression: - return s.rename({f: snake_to_camelcase(f) for f in s.keys()}) + return s.rename({f: snake_to_camelcase(f) for f in s}) def camelcase_array_structexpression_fields( diff --git a/v03_pipeline/lib/tasks/exports/misc_test.py b/v03_pipeline/lib/tasks/exports/misc_test.py index b3ad958b4..0290433a6 100644 --- a/v03_pipeline/lib/tasks/exports/misc_test.py +++ b/v03_pipeline/lib/tasks/exports/misc_test.py @@ -35,10 +35,16 @@ def test_camelcase_array_structexpression_fields(self) -> None: ht.collect()[0], hl.Struct( locus=hl.Locus( - contig='chr1', position=939121, reference_genome='GRCh38' + contig='chr1', + position=939121, + reference_genome='GRCh38', ), alleles=['C', 'T'], - rg37_locus=hl.Locus(contig=1, position=874501, reference_genome='GRCh37'), + rg37_locus=hl.Locus( + contig=1, + position=874501, + reference_genome='GRCh37', + ), rsid=None, variant_id='1-939121-C-T', xpos=1000939121, @@ -95,7 +101,8 @@ def test_camelcase_array_structexpression_fields(self) -> None: AN_Adj=47974, ), splice_ai=hl.Struct( - delta_score=0.0, splice_consequence='No consequence' + delta_score=0.0, + splice_consequence='No consequence', ), eigen=hl.Struct(Eigen_phred=2.628000020980835), clinvar=hl.Struct( From 135d0b37e2b21098da79d835918831d45b208d9f Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Fri, 18 Apr 2025 14:08:46 -0400 Subject: [PATCH 14/55] missing init py --- v03_pipeline/lib/tasks/exports/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 v03_pipeline/lib/tasks/exports/__init__.py diff --git a/v03_pipeline/lib/tasks/exports/__init__.py 
b/v03_pipeline/lib/tasks/exports/__init__.py new file mode 100644 index 000000000..e69de29bb From 085e8860f900f8c8424691487438612ef5b073c0 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Fri, 18 Apr 2025 14:42:25 -0400 Subject: [PATCH 15/55] move functions to export --- v03_pipeline/lib/annotations/misc.py | 181 ------- v03_pipeline/lib/annotations/misc_test.py | 496 -------------------- v03_pipeline/lib/tasks/exports/misc.py | 196 +++++++- v03_pipeline/lib/tasks/exports/misc_test.py | 489 ++++++++++++++++++- 4 files changed, 678 insertions(+), 684 deletions(-) delete mode 100644 v03_pipeline/lib/annotations/misc_test.py diff --git a/v03_pipeline/lib/annotations/misc.py b/v03_pipeline/lib/annotations/misc.py index c4ccfd7d4..e20e07d67 100644 --- a/v03_pipeline/lib/annotations/misc.py +++ b/v03_pipeline/lib/annotations/misc.py @@ -15,187 +15,6 @@ ) from v03_pipeline.lib.model import DatasetType from v03_pipeline.lib.model.definitions import ReferenceGenome -from v03_pipeline.lib.reference_datasets.reference_dataset import ReferenceDataset - - -def unmap_reference_dataset_annotation_enums( - ht: hl.Table, - reference_genome: ReferenceGenome, - dataset_type: DatasetType, -) -> hl.Table: - reference_datasets = ReferenceDataset.for_reference_genome_dataset_type_annotations( - reference_genome, - dataset_type, - ) - unmapped_annotation_name = [] - for annotation_name in ht.enums: - if annotation_name not in reference_datasets: - continue - for enum_name in ht.enums[annotation_name]: - if hasattr(ht[annotation_name], f'{enum_name}_ids'): - ht = ht.annotate( - **{ - annotation_name: ht[annotation_name].annotate( - **{ - f'{enum_name}s': ht[annotation_name][ - f'{enum_name}_ids' - ].map( - lambda idx: ht.enums[annotation_name][enum_name][ # noqa: B023 - idx - ], - ), - }, - ), - }, - ) - ht = ht.annotate( - **{annotation_name: ht[annotation_name].drop(f'{enum_name}_ids')}, - ) - else: - ht = ht.annotate( - **{ - annotation_name: ht[annotation_name].annotate( - 
**{ - enum_name: ht.enums[annotation_name][enum_name][ - ht[annotation_name][f'{enum_name}_id'] - ], - }, - ), - }, - ) - ht = ht.annotate( - **{annotation_name: ht[annotation_name].drop(f'{enum_name}_id')}, - ) - unmapped_annotation_name.append(annotation_name) - - # Explicit clinvar edge case: - if hasattr(ht, ReferenceDataset.clinvar.value): - ht = ht.annotate( - **{ - ReferenceDataset.clinvar.value: ht[ - ReferenceDataset.clinvar.value - ].annotate( - conflictingPathogenicities=ht[ - ReferenceDataset.clinvar.value - ].conflictingPathogenicities.map( - lambda s: s.annotate( - pathogenicity=ht.enums.clinvar.pathogenicity[ - s.pathogenicity_id - ], - ).drop('pathogenicity_id'), - ), - ), - }, - ) - return ht.annotate_globals(enums=ht.globals.enums.drop(*unmapped_annotation_name)) - - -def unmap_formatting_annotation_enums( - ht: hl.Table, - reference_genome: ReferenceGenome, - dataset_type: DatasetType, -) -> hl.Table: - formatting_annotation_names = { - fa.__name__ for fa in dataset_type.formatting_annotation_fns(reference_genome) - } - if 'sorted_motif_feature_consequences' in formatting_annotation_names: - ht = ht.annotate( - sorted_motif_feature_consequences=ht.sorted_motif_feature_consequences.map( - lambda c: c.annotate( - consequence_terms=c.consequence_term_ids.map( - lambda tid: hl.array(MOTIF_CONSEQUENCE_TERMS)[tid], - ), - ).drop('consequence_term_ids'), - ), - ) - ht = ht.annotate_globals( - enums=ht.enums.drop('sorted_motif_feature_consequences'), - ) - if 'sorted_regulatory_feature_consequences' in formatting_annotation_names: - ht = ht.annotate( - sorted_regulatory_feature_consequences=ht.sorted_regulatory_feature_consequences.map( - lambda c: c.annotate( - biotype=hl.array(REGULATORY_BIOTYPES)[c.biotype_id], - consequence_terms=c.consequence_term_ids.map( - lambda tid: hl.array(REGULATORY_CONSEQUENCE_TERMS)[tid], - ), - ).drop('biotype_id', 'consequence_term_ids'), - ), - ) - ht = ht.annotate_globals( - 
enums=ht.enums.drop('sorted_regulatory_feature_consequences'), - ) - if 'sorted_transcript_consequences' in formatting_annotation_names: - ht = ht.annotate( - sorted_transcript_consequences=ht.sorted_transcript_consequences.map( - lambda c: c.annotate( - biotype=hl.array(BIOTYPES)[c.biotype_id], - consequence_terms=c.consequence_term_ids.map( - lambda tid: hl.array(TRANSCRIPT_CONSEQUENCE_TERMS)[tid], - ), - **{ - 'loftee': c.loftee.annotate( - lof_filters=c.loftee.lof_filter_ids.map( - lambda fid: hl.array(LOF_FILTERS)[fid], - ), - ).drop('lof_filter_ids'), - 'utrannotator': c.utrannotator.annotate( - fiveutr_consequence=hl.array(FIVEUTR_CONSEQUENCES)[ - c.utrannotator.fiveutr_consequence_id - ], - ).drop('fiveutr_consequence_id'), - } - if reference_genome == ReferenceGenome.GRCh38 - and dataset_type == DatasetType.SNV_INDEL - else { - 'lof_filters': c.lof_filter_ids.map( - lambda fid: hl.array(LOF_FILTERS)[fid], - ), - }, - ).drop( - 'biotype_id', - 'consequence_term_ids', - *( - [] - if reference_genome == ReferenceGenome.GRCh38 - and dataset_type == DatasetType.SNV_INDEL - else [ - 'lof_filter_ids', - ] - ), - ), - ), - ) - ht = ht.annotate_globals(enums=ht.enums.drop('sorted_transcript_consequences')) - if 'mitotip' in formatting_annotation_names: - ht = ht.annotate( - mitotip=hl.Struct( - trna_prediction=hl.array(MITOTIP_PATHOGENICITIES)[ - ht.mitotip.trna_prediction_id - ], - ), - ) - ht = ht.annotate_globals(enums=ht.enums.drop('mitotip')) - if 'sv_type_id' in formatting_annotation_names: - ht = ht.annotate(sv_type=hl.array(SV_TYPES)[ht.sv_type_id]).drop('sv_type_id') - ht = ht.annotate_globals(enums=ht.enums.drop('sv_type')) - if 'sv_type_detail_id' in formatting_annotation_names: - ht = ht.annotate( - sv_type_detail=hl.array(SV_TYPE_DETAILS)[ht.sv_type_detail_id], - ).drop('sv_type_detail_id') - ht = ht.annotate_globals(enums=ht.enums.drop('sv_type_detail')) - if 'sorted_gene_consequences' in formatting_annotation_names: - ht = ht.annotate( - 
sorted_gene_consequences=ht.sorted_gene_consequences.map( - lambda c: c.annotate( - major_consequence=hl.array(SV_CONSEQUENCE_RANKS)[ - c.major_consequence_id - ], - ).drop('major_consequence_id'), - ), - ) - ht = ht.annotate_globals(enums=ht.enums.drop('sorted_gene_consequences')) - return ht def annotate_formatting_annotation_enum_globals( diff --git a/v03_pipeline/lib/annotations/misc_test.py b/v03_pipeline/lib/annotations/misc_test.py deleted file mode 100644 index 0e57d0894..000000000 --- a/v03_pipeline/lib/annotations/misc_test.py +++ /dev/null @@ -1,496 +0,0 @@ -import unittest - -import hail as hl - -from v03_pipeline.lib.annotations.misc import ( - unmap_formatting_annotation_enums, - unmap_reference_dataset_annotation_enums, -) -from v03_pipeline.lib.model import ( - DatasetType, - ReferenceGenome, -) - -TEST_SNV_INDEL_ANNOTATIONS = ( - 'v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht' -) - - -class MiscTest(unittest.TestCase): - def test_unmap_formatting_annotation_enums(self) -> None: - ht = hl.read_table(TEST_SNV_INDEL_ANNOTATIONS) - ht = unmap_formatting_annotation_enums( - ht, - ReferenceGenome.GRCh38, - DatasetType.SNV_INDEL, - ) - self.assertListEqual( - list(ht.globals.enums.collect()[0].keys()), - [ - 'screen', - 'dbnsfp', - 'clinvar', - 'gnomad_exomes', - 'gnomad_non_coding_constraint', - 'splice_ai', - 'exac', - 'topmed', - 'hgmd', - 'gnomad_genomes', - 'eigen', - ], - ) - self.assertEqual( - ht.collect()[0], - hl.Struct( - locus=hl.Locus( - contig='chr1', - position=939121, - reference_genome='GRCh38', - ), - alleles=['C', 'T'], - rg37_locus=hl.Locus( - contig=1, - position=874501, - reference_genome='GRCh37', - ), - rsid=None, - sorted_transcript_consequences=[ - hl.Struct( - amino_acids='S/L', - canonical=1, - codons='tCg/tTg', - gene_id='ENSG00000187634', - hgvsc='ENST00000616016.5:c.1049C>T', - hgvsp='ENSP00000478421.2:p.Ser350Leu', - transcript_id='ENST00000616016', - mane_select='NM_001385641.1', - mane_plus_clinical=None, 
- exon=hl.Struct(index=6, total=14), - intron=None, - refseq_transcript_id='NM_001385641.1', - alphamissense=hl.Struct(pathogenicity=None), - loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), - spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False, - ), - utrannotator=hl.Struct( - existing_inframe_oorfs=None, - existing_outofframe_oorfs=None, - existing_uorfs=None, - fiveutr_annotation=None, - fiveutr_consequence=None, - ), - biotype='protein_coding', - consequence_terms=['missense_variant'], - ), - hl.Struct( - amino_acids='S/L', - canonical=None, - codons='tCg/tTg', - gene_id='ENSG00000187634', - hgvsc='ENST00000341065.8:c.284C>T', - hgvsp='ENSP00000349216.4:p.Ser95Leu', - transcript_id='ENST00000341065', - mane_select=None, - mane_plus_clinical=None, - exon=hl.Struct(index=4, total=12), - intron=None, - refseq_transcript_id=None, - alphamissense=hl.Struct(pathogenicity=None), - loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), - spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False, - ), - utrannotator=hl.Struct( - existing_inframe_oorfs=None, - existing_outofframe_oorfs=None, - existing_uorfs=None, - fiveutr_annotation=None, - fiveutr_consequence=None, - ), - biotype='protein_coding', - consequence_terms=['missense_variant'], - ), - hl.Struct( - amino_acids='S/L', - canonical=None, - codons='tCg/tTg', - gene_id='ENSG00000187634', - hgvsc='ENST00000342066.8:c.512C>T', - hgvsp='ENSP00000342313.3:p.Ser171Leu', - transcript_id='ENST00000342066', - mane_select=None, - mane_plus_clinical=None, - exon=hl.Struct(index=6, total=14), - intron=None, - refseq_transcript_id='NM_152486.4', - alphamissense=hl.Struct(pathogenicity=0.1467999964952469), - loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), - spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False, - ), - utrannotator=hl.Struct( - existing_inframe_oorfs=None, - existing_outofframe_oorfs=None, - existing_uorfs=None, - fiveutr_annotation=None, - 
fiveutr_consequence=None, - ), - biotype='protein_coding', - consequence_terms=['missense_variant'], - ), - hl.Struct( - amino_acids='S/L', - canonical=None, - codons='tCg/tTg', - gene_id='ENSG00000187634', - hgvsc='ENST00000616125.5:c.512C>T', - hgvsp='ENSP00000484643.1:p.Ser171Leu', - transcript_id='ENST00000616125', - mane_select=None, - mane_plus_clinical=None, - exon=hl.Struct(index=5, total=11), - intron=None, - refseq_transcript_id=None, - alphamissense=hl.Struct(pathogenicity=0.1467999964952469), - loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), - spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False, - ), - utrannotator=hl.Struct( - existing_inframe_oorfs=None, - existing_outofframe_oorfs=None, - existing_uorfs=None, - fiveutr_annotation=None, - fiveutr_consequence=None, - ), - biotype='protein_coding', - consequence_terms=['missense_variant'], - ), - hl.Struct( - amino_acids='S/L', - canonical=None, - codons='tCg/tTg', - gene_id='ENSG00000187634', - hgvsc='ENST00000617307.5:c.512C>T', - hgvsp='ENSP00000482090.2:p.Ser171Leu', - transcript_id='ENST00000617307', - mane_select=None, - mane_plus_clinical=None, - exon=hl.Struct(index=5, total=13), - intron=None, - refseq_transcript_id=None, - alphamissense=hl.Struct(pathogenicity=0.1467999964952469), - loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), - spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False, - ), - utrannotator=hl.Struct( - existing_inframe_oorfs=None, - existing_outofframe_oorfs=None, - existing_uorfs=None, - fiveutr_annotation=None, - fiveutr_consequence=None, - ), - biotype='protein_coding', - consequence_terms=['missense_variant'], - ), - hl.Struct( - amino_acids='S/L', - canonical=None, - codons='tCg/tTg', - gene_id='ENSG00000187634', - hgvsc='ENST00000618181.5:c.461C>T', - hgvsp='ENSP00000480870.1:p.Ser154Leu', - transcript_id='ENST00000618181', - mane_select=None, - mane_plus_clinical=None, - exon=hl.Struct(index=4, total=10), - 
intron=None, - refseq_transcript_id=None, - alphamissense=hl.Struct(pathogenicity=None), - loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), - spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False, - ), - utrannotator=hl.Struct( - existing_inframe_oorfs=None, - existing_outofframe_oorfs=None, - existing_uorfs=None, - fiveutr_annotation=None, - fiveutr_consequence=None, - ), - biotype='protein_coding', - consequence_terms=['missense_variant'], - ), - hl.Struct( - amino_acids='S/L', - canonical=None, - codons='tCg/tTg', - gene_id='ENSG00000187634', - hgvsc='ENST00000618323.5:c.1049C>T', - hgvsp='ENSP00000480678.2:p.Ser350Leu', - transcript_id='ENST00000618323', - mane_select=None, - mane_plus_clinical=None, - exon=hl.Struct(index=6, total=14), - intron=None, - refseq_transcript_id='NM_001385640.1', - alphamissense=hl.Struct(pathogenicity=None), - loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), - spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False, - ), - utrannotator=hl.Struct( - existing_inframe_oorfs=None, - existing_outofframe_oorfs=None, - existing_uorfs=None, - fiveutr_annotation=None, - fiveutr_consequence=None, - ), - biotype='protein_coding', - consequence_terms=['missense_variant'], - ), - hl.Struct( - amino_acids='S/L', - canonical=None, - codons='tCg/tTg', - gene_id='ENSG00000187634', - hgvsc='ENST00000618779.5:c.512C>T', - hgvsp='ENSP00000484256.1:p.Ser171Leu', - transcript_id='ENST00000618779', - mane_select=None, - mane_plus_clinical=None, - exon=hl.Struct(index=5, total=12), - intron=None, - refseq_transcript_id=None, - alphamissense=hl.Struct(pathogenicity=0.1467999964952469), - loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), - spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False, - ), - utrannotator=hl.Struct( - existing_inframe_oorfs=None, - existing_outofframe_oorfs=None, - existing_uorfs=None, - fiveutr_annotation=None, - fiveutr_consequence=None, - ), - 
biotype='protein_coding', - consequence_terms=['missense_variant'], - ), - hl.Struct( - amino_acids='S/L', - canonical=None, - codons='tCg/tTg', - gene_id='ENSG00000187634', - hgvsc='ENST00000622503.5:c.512C>T', - hgvsp='ENSP00000482138.1:p.Ser171Leu', - transcript_id='ENST00000622503', - mane_select=None, - mane_plus_clinical=None, - exon=hl.Struct(index=5, total=13), - intron=None, - refseq_transcript_id=None, - alphamissense=hl.Struct(pathogenicity=0.1467999964952469), - loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), - spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False, - ), - utrannotator=hl.Struct( - existing_inframe_oorfs=None, - existing_outofframe_oorfs=None, - existing_uorfs=None, - fiveutr_annotation=None, - fiveutr_consequence=None, - ), - biotype='protein_coding', - consequence_terms=['missense_variant'], - ), - ], - variant_id='1-939121-C-T', - xpos=1000939121, - gt_stats=hl.Struct(AC=47, AN=81784, AF=0.0005746845272369683, hom=1), - CAID='CA502654', - check_ref=False, - sorted_regulatory_feature_consequences=[ - hl.Struct( - regulatory_feature_id='ENSR00000344437', - biotype='CTCF_binding_site', - consequence_terms=['regulatory_region_variant'], - ), - ], - sorted_motif_feature_consequences=[ - hl.Struct( - motif_feature_id='ENSM00493959715', - consequence_terms=['TF_binding_site_variant'], - ), - ], - gnomad_non_coding_constraint=hl.Struct(z_score=None), - hgmd=None, - gnomad_exomes=hl.Struct( - AF=0.0006690866430290043, - AN=1440770, - AC=964, - Hom=0, - AF_POPMAX_OR_GLOBAL=0.0008023773552849889, - FAF_AF=0.000633420015219599, - Hemi=0, - ), - gnomad_genomes=hl.Struct( - AF=0.0002759889466688037, - AN=152180, - AC=42, - Hom=0, - AF_POPMAX_OR_GLOBAL=0.0005293028079904616, - FAF_AF=0.0002092500071739778, - Hemi=0, - ), - screen=hl.Struct(region_type_ids=[]), - dbnsfp=hl.Struct( - PrimateAI_score=0.5918066501617432, - fathmm_MKL_coding_score=0.7174800038337708, - CADD_phred=23.5, - SIFT_score=0.0010000000474974513, - 
REVEL_score=0.3109999895095825, - Polyphen2_HVAR_score=0.164000004529953, - VEST4_score=0.39500001072883606, - MPC_score=0.01291007362306118, - MutPred_score=None, - MutationTaster_pred_id=0, - ), - topmed=hl.Struct( - AC=41, - AF=0.00032651599030941725, - AN=125568, - Hom=0, - Het=41, - ), - exac=hl.Struct( - AF_POPMAX=0.0007150234305299819, - AF=0.00019039999460801482, - AC_Adj=20, - AC_Het=20, - AC_Hom=0, - AC_Hemi=None, - AN_Adj=47974, - ), - splice_ai=hl.Struct(delta_score=0.0, splice_consequence_id=4), - eigen=hl.Struct(Eigen_phred=2.628000020980835), - clinvar=hl.Struct( - alleleId=929885, - conflictingPathogenicities=None, - goldStars=1, - submitters=['Labcorp Genetics (formerly Invitae), Labcorp'], - conditions=['not provided'], - assertion_ids=[], - pathogenicity_id=12, - ), - ), - ) - - def test_unmap_reference_dataset_annotation_enums(self) -> None: - ht = hl.read_table(TEST_SNV_INDEL_ANNOTATIONS) - ht = unmap_reference_dataset_annotation_enums( - ht, - ReferenceGenome.GRCh38, - DatasetType.SNV_INDEL, - ) - self.assertListEqual( - list(ht.globals.enums.collect()[0].keys()), - [ - 'sorted_motif_feature_consequences', - 'sorted_regulatory_feature_consequences', - 'sorted_transcript_consequences', - ], - ) - self.assertEqual( - ht.drop( - 'sorted_transcript_consequences', - 'sorted_regulatory_feature_consequences', - 'sorted_motif_feature_consequences', - ).collect()[0], - hl.Struct( - locus=hl.Locus( - contig='chr1', - position=939121, - reference_genome='GRCh38', - ), - alleles=['C', 'T'], - rg37_locus=hl.Locus( - contig=1, - position=874501, - reference_genome='GRCh37', - ), - rsid=None, - variant_id='1-939121-C-T', - xpos=1000939121, - gt_stats=hl.Struct(AC=47, AN=81784, AF=0.0005746845272369683, hom=1), - CAID='CA502654', - check_ref=False, - gnomad_non_coding_constraint=hl.Struct(z_score=None), - hgmd=None, - gnomad_exomes=hl.Struct( - AF=0.0006690866430290043, - AN=1440770, - AC=964, - Hom=0, - AF_POPMAX_OR_GLOBAL=0.0008023773552849889, - 
FAF_AF=0.000633420015219599, - Hemi=0, - ), - gnomad_genomes=hl.Struct( - AF=0.0002759889466688037, - AN=152180, - AC=42, - Hom=0, - AF_POPMAX_OR_GLOBAL=0.0005293028079904616, - FAF_AF=0.0002092500071739778, - Hemi=0, - ), - screen=hl.Struct(region_types=[]), - dbnsfp=hl.Struct( - PrimateAI_score=0.5918066501617432, - fathmm_MKL_coding_score=0.7174800038337708, - CADD_phred=23.5, - SIFT_score=0.0010000000474974513, - REVEL_score=0.3109999895095825, - Polyphen2_HVAR_score=0.164000004529953, - VEST4_score=0.39500001072883606, - MPC_score=0.01291007362306118, - MutPred_score=None, - MutationTaster_pred='D', - ), - topmed=hl.Struct( - AC=41, - AF=0.00032651599030941725, - AN=125568, - Hom=0, - Het=41, - ), - exac=hl.Struct( - AF_POPMAX=0.0007150234305299819, - AF=0.00019039999460801482, - AC_Adj=20, - AC_Het=20, - AC_Hom=0, - AC_Hemi=None, - AN_Adj=47974, - ), - splice_ai=hl.Struct( - delta_score=0.0, - splice_consequence='No consequence', - ), - eigen=hl.Struct(Eigen_phred=2.628000020980835), - clinvar=hl.Struct( - alleleId=929885, - conflictingPathogenicities=None, - goldStars=1, - submitters=['Labcorp Genetics (formerly Invitae), Labcorp'], - conditions=['not provided'], - assertions=[], - pathogenicity='Uncertain_significance', - ), - ), - ) diff --git a/v03_pipeline/lib/tasks/exports/misc.py b/v03_pipeline/lib/tasks/exports/misc.py index 483195477..171cd98b8 100644 --- a/v03_pipeline/lib/tasks/exports/misc.py +++ b/v03_pipeline/lib/tasks/exports/misc.py @@ -1,6 +1,20 @@ import hail as hl -from v03_pipeline.lib.model.definitions import ReferenceGenome +from v03_pipeline.lib.annotations.enums import ( + BIOTYPES, + FIVEUTR_CONSEQUENCES, + LOF_FILTERS, + MITOTIP_PATHOGENICITIES, + MOTIF_CONSEQUENCE_TERMS, + REGULATORY_BIOTYPES, + REGULATORY_CONSEQUENCE_TERMS, + SV_CONSEQUENCE_RANKS, + SV_TYPE_DETAILS, + SV_TYPES, + TRANSCRIPT_CONSEQUENCE_TERMS, +) +from v03_pipeline.lib.model.definitions import DatasetType, ReferenceGenome +from 
v03_pipeline.lib.reference_datasets.reference_dataset import ReferenceDataset def snake_to_camelcase(snake_string: str): @@ -44,3 +58,183 @@ def camelcase_array_structexpression_fields( ), ) return ht + + +def unmap_reference_dataset_annotation_enums( + ht: hl.Table, + reference_genome: ReferenceGenome, + dataset_type: DatasetType, +) -> hl.Table: + reference_datasets = ReferenceDataset.for_reference_genome_dataset_type_annotations( + reference_genome, + dataset_type, + ) + unmapped_annotation_name = [] + for annotation_name in ht.enums: + if annotation_name not in reference_datasets: + continue + for enum_name in ht.enums[annotation_name]: + if hasattr(ht[annotation_name], f'{enum_name}_ids'): + ht = ht.annotate( + **{ + annotation_name: ht[annotation_name].annotate( + **{ + f'{enum_name}s': ht[annotation_name][ + f'{enum_name}_ids' + ].map( + lambda idx: ht.enums[annotation_name][enum_name][ # noqa: B023 + idx + ], + ), + }, + ), + }, + ) + ht = ht.annotate( + **{annotation_name: ht[annotation_name].drop(f'{enum_name}_ids')}, + ) + else: + ht = ht.annotate( + **{ + annotation_name: ht[annotation_name].annotate( + **{ + enum_name: ht.enums[annotation_name][enum_name][ + ht[annotation_name][f'{enum_name}_id'] + ], + }, + ), + }, + ) + ht = ht.annotate( + **{annotation_name: ht[annotation_name].drop(f'{enum_name}_id')}, + ) + unmapped_annotation_name.append(annotation_name) + + # Explicit clinvar edge case: + if hasattr(ht, ReferenceDataset.clinvar.value): + ht = ht.annotate( + **{ + ReferenceDataset.clinvar.value: ht[ + ReferenceDataset.clinvar.value + ].annotate( + conflictingPathogenicities=ht[ + ReferenceDataset.clinvar.value + ].conflictingPathogenicities.map( + lambda s: s.annotate( + pathogenicity=ht.enums.clinvar.pathogenicity[ + s.pathogenicity_id + ], + ).drop('pathogenicity_id'), + ), + ), + }, + ) + return ht.annotate_globals(enums=ht.globals.enums.drop(*unmapped_annotation_name)) + + +def unmap_formatting_annotation_enums( + ht: hl.Table, + 
reference_genome: ReferenceGenome, + dataset_type: DatasetType, +) -> hl.Table: + formatting_annotation_names = { + fa.__name__ for fa in dataset_type.formatting_annotation_fns(reference_genome) + } + if 'sorted_motif_feature_consequences' in formatting_annotation_names: + ht = ht.annotate( + sorted_motif_feature_consequences=ht.sorted_motif_feature_consequences.map( + lambda c: c.annotate( + consequence_terms=c.consequence_term_ids.map( + lambda tid: hl.array(MOTIF_CONSEQUENCE_TERMS)[tid], + ), + ).drop('consequence_term_ids'), + ), + ) + ht = ht.annotate_globals( + enums=ht.enums.drop('sorted_motif_feature_consequences'), + ) + if 'sorted_regulatory_feature_consequences' in formatting_annotation_names: + ht = ht.annotate( + sorted_regulatory_feature_consequences=ht.sorted_regulatory_feature_consequences.map( + lambda c: c.annotate( + biotype=hl.array(REGULATORY_BIOTYPES)[c.biotype_id], + consequence_terms=c.consequence_term_ids.map( + lambda tid: hl.array(REGULATORY_CONSEQUENCE_TERMS)[tid], + ), + ).drop('biotype_id', 'consequence_term_ids'), + ), + ) + ht = ht.annotate_globals( + enums=ht.enums.drop('sorted_regulatory_feature_consequences'), + ) + if 'sorted_transcript_consequences' in formatting_annotation_names: + ht = ht.annotate( + sorted_transcript_consequences=ht.sorted_transcript_consequences.map( + lambda c: c.annotate( + biotype=hl.array(BIOTYPES)[c.biotype_id], + consequence_terms=c.consequence_term_ids.map( + lambda tid: hl.array(TRANSCRIPT_CONSEQUENCE_TERMS)[tid], + ), + **{ + 'loftee': c.loftee.annotate( + lof_filters=c.loftee.lof_filter_ids.map( + lambda fid: hl.array(LOF_FILTERS)[fid], + ), + ).drop('lof_filter_ids'), + 'utrannotator': c.utrannotator.annotate( + fiveutr_consequence=hl.array(FIVEUTR_CONSEQUENCES)[ + c.utrannotator.fiveutr_consequence_id + ], + ).drop('fiveutr_consequence_id'), + } + if reference_genome == ReferenceGenome.GRCh38 + and dataset_type == DatasetType.SNV_INDEL + else { + 'lof_filters': c.lof_filter_ids.map( + lambda fid: 
hl.array(LOF_FILTERS)[fid], + ), + }, + ).drop( + 'biotype_id', + 'consequence_term_ids', + *( + [] + if reference_genome == ReferenceGenome.GRCh38 + and dataset_type == DatasetType.SNV_INDEL + else [ + 'lof_filter_ids', + ] + ), + ), + ), + ) + ht = ht.annotate_globals(enums=ht.enums.drop('sorted_transcript_consequences')) + if 'mitotip' in formatting_annotation_names: + ht = ht.annotate( + mitotip=hl.Struct( + trna_prediction=hl.array(MITOTIP_PATHOGENICITIES)[ + ht.mitotip.trna_prediction_id + ], + ), + ) + ht = ht.annotate_globals(enums=ht.enums.drop('mitotip')) + if 'sv_type_id' in formatting_annotation_names: + ht = ht.annotate(sv_type=hl.array(SV_TYPES)[ht.sv_type_id]).drop('sv_type_id') + ht = ht.annotate_globals(enums=ht.enums.drop('sv_type')) + if 'sv_type_detail_id' in formatting_annotation_names: + ht = ht.annotate( + sv_type_detail=hl.array(SV_TYPE_DETAILS)[ht.sv_type_detail_id], + ).drop('sv_type_detail_id') + ht = ht.annotate_globals(enums=ht.enums.drop('sv_type_detail')) + if 'sorted_gene_consequences' in formatting_annotation_names: + ht = ht.annotate( + sorted_gene_consequences=ht.sorted_gene_consequences.map( + lambda c: c.annotate( + major_consequence=hl.array(SV_CONSEQUENCE_RANKS)[ + c.major_consequence_id + ], + ).drop('major_consequence_id'), + ), + ) + ht = ht.annotate_globals(enums=ht.enums.drop('sorted_gene_consequences')) + return ht diff --git a/v03_pipeline/lib/tasks/exports/misc_test.py b/v03_pipeline/lib/tasks/exports/misc_test.py index 0290433a6..3175ed5de 100644 --- a/v03_pipeline/lib/tasks/exports/misc_test.py +++ b/v03_pipeline/lib/tasks/exports/misc_test.py @@ -2,22 +2,499 @@ import hail as hl -from v03_pipeline.lib.annotations.misc import ( - unmap_formatting_annotation_enums, - unmap_reference_dataset_annotation_enums, -) from v03_pipeline.lib.model import ( DatasetType, ReferenceGenome, ) -from v03_pipeline.lib.tasks.exports.misc import camelcase_array_structexpression_fields +from v03_pipeline.lib.tasks.exports.misc import ( + 
camelcase_array_structexpression_fields, + unmap_formatting_annotation_enums, + unmap_reference_dataset_annotation_enums, +) TEST_SNV_INDEL_ANNOTATIONS = ( 'v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht' ) - class MiscTest(unittest.TestCase): + def test_unmap_formatting_annotation_enums(self) -> None: + ht = hl.read_table(TEST_SNV_INDEL_ANNOTATIONS) + ht = unmap_formatting_annotation_enums( + ht, + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + ) + self.assertListEqual( + list(ht.globals.enums.collect()[0].keys()), + [ + 'screen', + 'dbnsfp', + 'clinvar', + 'gnomad_exomes', + 'gnomad_non_coding_constraint', + 'splice_ai', + 'exac', + 'topmed', + 'hgmd', + 'gnomad_genomes', + 'eigen', + ], + ) + self.assertEqual( + ht.collect()[0], + hl.Struct( + locus=hl.Locus( + contig='chr1', + position=939121, + reference_genome='GRCh38', + ), + alleles=['C', 'T'], + rg37_locus=hl.Locus( + contig=1, + position=874501, + reference_genome='GRCh37', + ), + rsid=None, + sorted_transcript_consequences=[ + hl.Struct( + amino_acids='S/L', + canonical=1, + codons='tCg/tTg', + gene_id='ENSG00000187634', + hgvsc='ENST00000616016.5:c.1049C>T', + hgvsp='ENSP00000478421.2:p.Ser350Leu', + transcript_id='ENST00000616016', + mane_select='NM_001385641.1', + mane_plus_clinical=None, + exon=hl.Struct(index=6, total=14), + intron=None, + refseq_transcript_id='NM_001385641.1', + alphamissense=hl.Struct(pathogenicity=None), + loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), + spliceregion=hl.Struct( + extended_intronic_splice_region_variant=False, + ), + utrannotator=hl.Struct( + existing_inframe_oorfs=None, + existing_outofframe_oorfs=None, + existing_uorfs=None, + fiveutr_annotation=None, + fiveutr_consequence=None, + ), + biotype='protein_coding', + consequence_terms=['missense_variant'], + ), + hl.Struct( + amino_acids='S/L', + canonical=None, + codons='tCg/tTg', + gene_id='ENSG00000187634', + hgvsc='ENST00000341065.8:c.284C>T', + hgvsp='ENSP00000349216.4:p.Ser95Leu', + 
transcript_id='ENST00000341065', + mane_select=None, + mane_plus_clinical=None, + exon=hl.Struct(index=4, total=12), + intron=None, + refseq_transcript_id=None, + alphamissense=hl.Struct(pathogenicity=None), + loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), + spliceregion=hl.Struct( + extended_intronic_splice_region_variant=False, + ), + utrannotator=hl.Struct( + existing_inframe_oorfs=None, + existing_outofframe_oorfs=None, + existing_uorfs=None, + fiveutr_annotation=None, + fiveutr_consequence=None, + ), + biotype='protein_coding', + consequence_terms=['missense_variant'], + ), + hl.Struct( + amino_acids='S/L', + canonical=None, + codons='tCg/tTg', + gene_id='ENSG00000187634', + hgvsc='ENST00000342066.8:c.512C>T', + hgvsp='ENSP00000342313.3:p.Ser171Leu', + transcript_id='ENST00000342066', + mane_select=None, + mane_plus_clinical=None, + exon=hl.Struct(index=6, total=14), + intron=None, + refseq_transcript_id='NM_152486.4', + alphamissense=hl.Struct(pathogenicity=0.1467999964952469), + loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), + spliceregion=hl.Struct( + extended_intronic_splice_region_variant=False, + ), + utrannotator=hl.Struct( + existing_inframe_oorfs=None, + existing_outofframe_oorfs=None, + existing_uorfs=None, + fiveutr_annotation=None, + fiveutr_consequence=None, + ), + biotype='protein_coding', + consequence_terms=['missense_variant'], + ), + hl.Struct( + amino_acids='S/L', + canonical=None, + codons='tCg/tTg', + gene_id='ENSG00000187634', + hgvsc='ENST00000616125.5:c.512C>T', + hgvsp='ENSP00000484643.1:p.Ser171Leu', + transcript_id='ENST00000616125', + mane_select=None, + mane_plus_clinical=None, + exon=hl.Struct(index=5, total=11), + intron=None, + refseq_transcript_id=None, + alphamissense=hl.Struct(pathogenicity=0.1467999964952469), + loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), + spliceregion=hl.Struct( + extended_intronic_splice_region_variant=False, + ), + utrannotator=hl.Struct( + existing_inframe_oorfs=None, + 
existing_outofframe_oorfs=None, + existing_uorfs=None, + fiveutr_annotation=None, + fiveutr_consequence=None, + ), + biotype='protein_coding', + consequence_terms=['missense_variant'], + ), + hl.Struct( + amino_acids='S/L', + canonical=None, + codons='tCg/tTg', + gene_id='ENSG00000187634', + hgvsc='ENST00000617307.5:c.512C>T', + hgvsp='ENSP00000482090.2:p.Ser171Leu', + transcript_id='ENST00000617307', + mane_select=None, + mane_plus_clinical=None, + exon=hl.Struct(index=5, total=13), + intron=None, + refseq_transcript_id=None, + alphamissense=hl.Struct(pathogenicity=0.1467999964952469), + loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), + spliceregion=hl.Struct( + extended_intronic_splice_region_variant=False, + ), + utrannotator=hl.Struct( + existing_inframe_oorfs=None, + existing_outofframe_oorfs=None, + existing_uorfs=None, + fiveutr_annotation=None, + fiveutr_consequence=None, + ), + biotype='protein_coding', + consequence_terms=['missense_variant'], + ), + hl.Struct( + amino_acids='S/L', + canonical=None, + codons='tCg/tTg', + gene_id='ENSG00000187634', + hgvsc='ENST00000618181.5:c.461C>T', + hgvsp='ENSP00000480870.1:p.Ser154Leu', + transcript_id='ENST00000618181', + mane_select=None, + mane_plus_clinical=None, + exon=hl.Struct(index=4, total=10), + intron=None, + refseq_transcript_id=None, + alphamissense=hl.Struct(pathogenicity=None), + loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), + spliceregion=hl.Struct( + extended_intronic_splice_region_variant=False, + ), + utrannotator=hl.Struct( + existing_inframe_oorfs=None, + existing_outofframe_oorfs=None, + existing_uorfs=None, + fiveutr_annotation=None, + fiveutr_consequence=None, + ), + biotype='protein_coding', + consequence_terms=['missense_variant'], + ), + hl.Struct( + amino_acids='S/L', + canonical=None, + codons='tCg/tTg', + gene_id='ENSG00000187634', + hgvsc='ENST00000618323.5:c.1049C>T', + hgvsp='ENSP00000480678.2:p.Ser350Leu', + transcript_id='ENST00000618323', + mane_select=None, + 
mane_plus_clinical=None, + exon=hl.Struct(index=6, total=14), + intron=None, + refseq_transcript_id='NM_001385640.1', + alphamissense=hl.Struct(pathogenicity=None), + loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), + spliceregion=hl.Struct( + extended_intronic_splice_region_variant=False, + ), + utrannotator=hl.Struct( + existing_inframe_oorfs=None, + existing_outofframe_oorfs=None, + existing_uorfs=None, + fiveutr_annotation=None, + fiveutr_consequence=None, + ), + biotype='protein_coding', + consequence_terms=['missense_variant'], + ), + hl.Struct( + amino_acids='S/L', + canonical=None, + codons='tCg/tTg', + gene_id='ENSG00000187634', + hgvsc='ENST00000618779.5:c.512C>T', + hgvsp='ENSP00000484256.1:p.Ser171Leu', + transcript_id='ENST00000618779', + mane_select=None, + mane_plus_clinical=None, + exon=hl.Struct(index=5, total=12), + intron=None, + refseq_transcript_id=None, + alphamissense=hl.Struct(pathogenicity=0.1467999964952469), + loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), + spliceregion=hl.Struct( + extended_intronic_splice_region_variant=False, + ), + utrannotator=hl.Struct( + existing_inframe_oorfs=None, + existing_outofframe_oorfs=None, + existing_uorfs=None, + fiveutr_annotation=None, + fiveutr_consequence=None, + ), + biotype='protein_coding', + consequence_terms=['missense_variant'], + ), + hl.Struct( + amino_acids='S/L', + canonical=None, + codons='tCg/tTg', + gene_id='ENSG00000187634', + hgvsc='ENST00000622503.5:c.512C>T', + hgvsp='ENSP00000482138.1:p.Ser171Leu', + transcript_id='ENST00000622503', + mane_select=None, + mane_plus_clinical=None, + exon=hl.Struct(index=5, total=13), + intron=None, + refseq_transcript_id=None, + alphamissense=hl.Struct(pathogenicity=0.1467999964952469), + loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), + spliceregion=hl.Struct( + extended_intronic_splice_region_variant=False, + ), + utrannotator=hl.Struct( + existing_inframe_oorfs=None, + existing_outofframe_oorfs=None, + existing_uorfs=None, 
+ fiveutr_annotation=None, + fiveutr_consequence=None, + ), + biotype='protein_coding', + consequence_terms=['missense_variant'], + ), + ], + variant_id='1-939121-C-T', + xpos=1000939121, + gt_stats=hl.Struct(AC=47, AN=81784, AF=0.0005746845272369683, hom=1), + CAID='CA502654', + check_ref=False, + sorted_regulatory_feature_consequences=[ + hl.Struct( + regulatory_feature_id='ENSR00000344437', + biotype='CTCF_binding_site', + consequence_terms=['regulatory_region_variant'], + ), + ], + sorted_motif_feature_consequences=[ + hl.Struct( + motif_feature_id='ENSM00493959715', + consequence_terms=['TF_binding_site_variant'], + ), + ], + gnomad_non_coding_constraint=hl.Struct(z_score=None), + hgmd=None, + gnomad_exomes=hl.Struct( + AF=0.0006690866430290043, + AN=1440770, + AC=964, + Hom=0, + AF_POPMAX_OR_GLOBAL=0.0008023773552849889, + FAF_AF=0.000633420015219599, + Hemi=0, + ), + gnomad_genomes=hl.Struct( + AF=0.0002759889466688037, + AN=152180, + AC=42, + Hom=0, + AF_POPMAX_OR_GLOBAL=0.0005293028079904616, + FAF_AF=0.0002092500071739778, + Hemi=0, + ), + screen=hl.Struct(region_type_ids=[]), + dbnsfp=hl.Struct( + PrimateAI_score=0.5918066501617432, + fathmm_MKL_coding_score=0.7174800038337708, + CADD_phred=23.5, + SIFT_score=0.0010000000474974513, + REVEL_score=0.3109999895095825, + Polyphen2_HVAR_score=0.164000004529953, + VEST4_score=0.39500001072883606, + MPC_score=0.01291007362306118, + MutPred_score=None, + MutationTaster_pred_id=0, + ), + topmed=hl.Struct( + AC=41, + AF=0.00032651599030941725, + AN=125568, + Hom=0, + Het=41, + ), + exac=hl.Struct( + AF_POPMAX=0.0007150234305299819, + AF=0.00019039999460801482, + AC_Adj=20, + AC_Het=20, + AC_Hom=0, + AC_Hemi=None, + AN_Adj=47974, + ), + splice_ai=hl.Struct(delta_score=0.0, splice_consequence_id=4), + eigen=hl.Struct(Eigen_phred=2.628000020980835), + clinvar=hl.Struct( + alleleId=929885, + conflictingPathogenicities=None, + goldStars=1, + submitters=['Labcorp Genetics (formerly Invitae), Labcorp'], + 
conditions=['not provided'], + assertion_ids=[], + pathogenicity_id=12, + ), + ), + ) + + def test_unmap_reference_dataset_annotation_enums(self) -> None: + ht = hl.read_table(TEST_SNV_INDEL_ANNOTATIONS) + ht = unmap_reference_dataset_annotation_enums( + ht, + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + ) + self.assertListEqual( + list(ht.globals.enums.collect()[0].keys()), + [ + 'sorted_motif_feature_consequences', + 'sorted_regulatory_feature_consequences', + 'sorted_transcript_consequences', + ], + ) + self.assertEqual( + ht.drop( + 'sorted_transcript_consequences', + 'sorted_regulatory_feature_consequences', + 'sorted_motif_feature_consequences', + ).collect()[0], + hl.Struct( + locus=hl.Locus( + contig='chr1', + position=939121, + reference_genome='GRCh38', + ), + alleles=['C', 'T'], + rg37_locus=hl.Locus( + contig=1, + position=874501, + reference_genome='GRCh37', + ), + rsid=None, + variant_id='1-939121-C-T', + xpos=1000939121, + gt_stats=hl.Struct(AC=47, AN=81784, AF=0.0005746845272369683, hom=1), + CAID='CA502654', + check_ref=False, + gnomad_non_coding_constraint=hl.Struct(z_score=None), + hgmd=None, + gnomad_exomes=hl.Struct( + AF=0.0006690866430290043, + AN=1440770, + AC=964, + Hom=0, + AF_POPMAX_OR_GLOBAL=0.0008023773552849889, + FAF_AF=0.000633420015219599, + Hemi=0, + ), + gnomad_genomes=hl.Struct( + AF=0.0002759889466688037, + AN=152180, + AC=42, + Hom=0, + AF_POPMAX_OR_GLOBAL=0.0005293028079904616, + FAF_AF=0.0002092500071739778, + Hemi=0, + ), + screen=hl.Struct(region_types=[]), + dbnsfp=hl.Struct( + PrimateAI_score=0.5918066501617432, + fathmm_MKL_coding_score=0.7174800038337708, + CADD_phred=23.5, + SIFT_score=0.0010000000474974513, + REVEL_score=0.3109999895095825, + Polyphen2_HVAR_score=0.164000004529953, + VEST4_score=0.39500001072883606, + MPC_score=0.01291007362306118, + MutPred_score=None, + MutationTaster_pred='D', + ), + topmed=hl.Struct( + AC=41, + AF=0.00032651599030941725, + AN=125568, + Hom=0, + Het=41, + ), + exac=hl.Struct( 
+ AF_POPMAX=0.0007150234305299819, + AF=0.00019039999460801482, + AC_Adj=20, + AC_Het=20, + AC_Hom=0, + AC_Hemi=None, + AN_Adj=47974, + ), + splice_ai=hl.Struct( + delta_score=0.0, + splice_consequence='No consequence', + ), + eigen=hl.Struct(Eigen_phred=2.628000020980835), + clinvar=hl.Struct( + alleleId=929885, + conflictingPathogenicities=None, + goldStars=1, + submitters=['Labcorp Genetics (formerly Invitae), Labcorp'], + conditions=['not provided'], + assertions=[], + pathogenicity='Uncertain_significance', + ), + ), + ) + def test_camelcase_array_structexpression_fields(self) -> None: ht = hl.read_table(TEST_SNV_INDEL_ANNOTATIONS) ht = unmap_formatting_annotation_enums( From f2165cee437568d346311894a323350b199c7b4f Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Sun, 20 Apr 2025 13:16:07 -0400 Subject: [PATCH 16/55] mostly functioning entries task --- v03_pipeline/lib/model/dataset_type.py | 32 ++++ v03_pipeline/lib/paths.py | 45 ++++++ .../lib/tasks/base/base_write_parquet.py | 15 ++ v03_pipeline/lib/tasks/exports/misc.py | 58 +++++++- v03_pipeline/lib/tasks/exports/misc_test.py | 1 + .../exports/write_new_entries_parquet.py | 57 +++++++ .../exports/write_new_transcripts_parquet.py | 69 +++++++++ .../write_new_transcripts_parquet_test.py | 83 +++++++++++ .../exports/write_new_variants_parquet.py | 140 ++++++++++++++++++ .../write_new_variants_parquet_test.py | 55 +++++++ 10 files changed, 548 insertions(+), 7 deletions(-) create mode 100644 v03_pipeline/lib/tasks/base/base_write_parquet.py create mode 100644 v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py create mode 100644 v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py create mode 100644 v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py create mode 100644 v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py create mode 100644 v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py diff --git 
def export_parquet_filterable_transcripts_fields(
    self,
    reference_genome: ReferenceGenome,
) -> OrderedDict[str, str]:
    """Map exported transcript field names to their source (possibly nested) paths.

    Keys are the field names written to parquet; values are the dotted
    paths the fields are read from.  Every dataset type exports ``geneId``;
    the remaining entries depend on the dataset type and, for SNV_INDEL,
    on the reference genome.
    """
    # These fields keep their own name in the export.
    identity_names = ['geneId']
    if self in {DatasetType.SV, DatasetType.GCNV}:
        identity_names.append('majorConsequence')
    if self in {DatasetType.SNV_INDEL, DatasetType.MITO}:
        identity_names.extend(['canonical', 'consequenceTerms'])
    renames = dict(zip(identity_names, identity_names))

    is_grch38_snv_indel = (
        self == DatasetType.SNV_INDEL and reference_genome == ReferenceGenome.GRCh38
    )
    if is_grch38_snv_indel:
        # These are pulled up out of nested structs.
        renames.update(
            {
                'alphamissense': 'alphamissense.pathogenicity',
                'extended_intronic_splice_region_variant': (
                    'spliceregion.extended_intronic_splice_region_variant'
                ),
                'fiveutrConsequence': 'utrannotator.fiveutrConsequence',
            },
        )
    # Parquet export expects all fields sorted alphabetically
    return OrderedDict(sorted(renames.items()))
# Names the class needs at definition time; the original module only
# imported `checkpoint`, so `luigi` and the base class were unresolved.
import luigi

from v03_pipeline.lib.tasks.files import GCSorLocalFolderTarget


class BaseWriteParquetTask(luigi.Task):
    """Base task that materialises a Hail table as a parquet dataset.

    Subclasses provide ``output()`` and ``create_table()``; this class
    supplies the shared ``complete()`` check and the ``run()`` body that
    checkpoints the table, converts it to a Spark DataFrame and writes it.
    """

    def complete(self) -> bool:
        """Return whether the output folder target reports that it exists."""
        return GCSorLocalFolderTarget(self.output().path).exists()

    def create_table(self):
        """Build the Hail table to export.  Must be overridden by subclasses."""
        raise NotImplementedError

    def run(self) -> None:
        """Checkpoint the subclass's table and write it to ``output().path``."""
        ht = self.create_table()
        ht, _ = checkpoint(ht)
        df = ht.to_spark()
        df.write.parquet(
            self.output().path,
            # Replace whatever a previous (possibly partial) run left behind.
            mode='overwrite',
        )
def subset_filterable_transcripts_fields(
    ht: hl.Table,
    reference_genome: ReferenceGenome,
    dataset_type: DatasetType,
) -> hl.Table:
    """Reduce each transcript struct to the filterable fields for export.

    The transcripts array (its name comes from ``transcripts_field_name``)
    is mapped element by element; every element is replaced by a struct
    holding only the fields listed by
    ``dataset_type.export_parquet_filterable_transcripts_fields``, keyed by
    their export names.
    """
    field_name = transcripts_field_name(reference_genome, dataset_type)
    # The mapping does not depend on the element, so resolve it once.
    filterable_fields = dataset_type.export_parquet_filterable_transcripts_fields(
        reference_genome,
    )
    return ht.annotate(
        **{
            field_name: ht[field_name].map(
                lambda c: c.select(
                    **{
                        # Resolve the path against the current element `c`.
                        # Resolving against the whole array `ht[field_name]`
                        # would give every element the full array of values.
                        # NOTE(review): assumes parse_nested_field walks the
                        # dotted path via item access and accepts a struct
                        # expression - confirm against its definition.
                        new_nested_field_name: parse_nested_field(
                            c,
                            existing_nested_field_name,
                        )
                        for new_nested_field_name, existing_nested_field_name in filterable_fields.items()
                    },
                ),
            ),
        },
    )
@luigi.util.inherits(BaseLoadingRunParams)
class WriteNewEntriesParquetTask(BaseWriteParquetTask):
    """Task intended to export a run's new entries as parquet."""

    def output(self) -> luigi.Target:
        """Target at the run-scoped ``new_entries.parquet`` path."""
        parquet_path = new_entries_parquet_path(
            self.reference_genome,
            self.dataset_type,
            self.run_id,
        )
        return GCSorLocalTarget(parquet_path)

    def requires(self) -> list[luigi.Task]:
        """Upstream tasks, keyed by role.

        NOTE(review): the annotation says ``list`` but a dict is returned.
        """
        if self.dataset_type.should_send_to_allele_registry:
            annotations_task = self.clone(UpdateNewVariantsWithCAIDsTask)
        else:
            annotations_task = self.clone(WriteNewVariantsTableTask)
        high_af_variants_task = self.clone(
            UpdatedReferenceDatasetQueryTask,
            reference_dataset_query=ReferenceDatasetQuery.high_af_variants,
        )
        # One remap/subset task per project, addressed by its index.
        callset_tasks = [
            self.clone(WriteRemappedAndSubsettedCallsetTask, project_i=project_i)
            for project_i, _ in enumerate(self.project_guids)
        ]
        return {
            'annotations': annotations_task,
            'high_af_variants': high_af_variants_task,
            'remapped_and_subsetted_callsets': callset_tasks,
        }

    def run(self) -> None:
        # NOTE(review): no-op override; it shadows the inherited
        # BaseWriteParquetTask.run(), so nothing is written - presumably a
        # placeholder until the entries export is implemented.
        pass
@luigi.util.inherits(BaseLoadingRunParams)
class WriteNewTranscriptsParquetTask(BaseWriteParquetTask):
    """Export the transcripts of a run's new variants, grouped by gene, as parquet."""

    def output(self) -> luigi.Target:
        """Target at the run-scoped ``new_transcripts.parquet`` path."""
        return GCSorLocalTarget(
            new_transcripts_parquet_path(
                self.reference_genome,
                self.dataset_type,
                self.run_id,
            ),
        )

    def complete(self) -> bool:
        """Return whether the output folder target reports that it exists."""
        return GCSorLocalFolderTarget(self.output().path).exists()

    def requires(self) -> list[luigi.Task]:
        """Depend on whichever task produces the new-variants table."""
        return [
            self.clone(UpdateNewVariantsWithCAIDsTask)
            if self.dataset_type.should_send_to_allele_registry
            else self.clone(WriteNewVariantsTableTask),
        ]

    def create_table(self) -> hl.Table:
        """Build the table that ``BaseWriteParquetTask.run()`` writes out.

        This logic used to live in an overriding ``run()`` that merely
        returned the table.  That bypassed the base class's ``run()``, which
        is the code that calls ``create_table()``, checkpoints the result
        and writes the parquet, so nothing was ever written.
        """
        ht = hl.read_table(
            new_variants_table_path(
                self.reference_genome,
                self.dataset_type,
                self.run_id,
            ),
        )
        ht = unmap_formatting_annotation_enums(
            ht,
            self.reference_genome,
            self.dataset_type,
        )
        ht = camelcase_array_structexpression_fields(ht, self.reference_genome)
        # Drop the key so only the selected field is exported.
        ht = ht.key_by()
        transcripts = ht[
            transcripts_field_name(self.reference_genome, self.dataset_type)
        ]
        return ht.select(
            # (geneId, transcripts-for-that-gene) pairs.
            transcripts=transcripts.group_by(
                lambda c: c.geneId,
            ).items(),
        )
class WriteNewTranscriptsParquetTest(MockedDatarootTestCase):
    """End-to-end check of WriteNewTranscriptsParquetTask against the SNV_INDEL fixture."""

    def setUp(self) -> None:
        super().setUp()
        ht = hl.read_table(
            TEST_SNV_INDEL_ANNOTATIONS,
        )
        ht.write(
            new_variants_table_path(
                ReferenceGenome.GRCh38,
                DatasetType.SNV_INDEL,
                TEST_RUN_ID,
            ),
        )

        # Make an incomplete parquet to validate overwrite-ing.
        parquet_dir = new_transcripts_parquet_path(
            ReferenceGenome.GRCh38,
            DatasetType.SNV_INDEL,
            TEST_RUN_ID,
        )
        os.makedirs(parquet_dir, exist_ok=True)
        self.stale_file_path = os.path.join(parquet_dir, 'incomplete_file.parquet')
        with open(self.stale_file_path, 'w') as f:
            f.write('')

    def test_write_new_transcripts_parquet_test(
        self,
    ) -> None:
        worker = luigi.worker.Worker()
        task = WriteNewTranscriptsParquetTask(
            reference_genome=ReferenceGenome.GRCh38,
            dataset_type=DatasetType.SNV_INDEL,
            sample_type=SampleType.WGS,
            callset_path='fake_callset',
            project_guids=[
                'fake_project',
            ],
            project_pedigree_paths=['fake_pedigree'],
            skip_validation=True,
            run_id=TEST_RUN_ID,
        )
        worker.add(task)
        worker.run()
        self.assertTrue(task.output().exists())
        self.assertTrue(task.complete())
        # The placeholder seeded in setUp must be gone once the task has
        # written its output with mode='overwrite'.  Without this check the
        # pre-created directory alone satisfies the assertions above.
        self.assertFalse(os.path.exists(self.stale_file_path))


# The module-level `pd.read_parquet('example_pa.parquet', engine='pyarrow')`
# that followed the class was removed: it ran at import time against a file
# this test never creates.
@luigi.util.inherits(BaseLoadingRunParams)
class WriteNewVariantsParquetTask(BaseWriteParquetTask):
    """Export a run's new variants, reshaped for downstream loading, as parquet."""

    def output(self) -> luigi.Target:
        """Target at the run-scoped ``new_variants.parquet`` path."""
        return GCSorLocalTarget(
            new_variants_parquet_path(
                self.reference_genome,
                self.dataset_type,
                self.run_id,
            ),
        )

    def complete(self) -> bool:
        """Return whether the output folder target reports that it exists."""
        return GCSorLocalFolderTarget(self.output().path).exists()

    def requires(self) -> list[luigi.Task]:
        """Depend on whichever task produces the new-variants table."""
        return [
            self.clone(UpdateNewVariantsWithCAIDsTask)
            if self.dataset_type.should_send_to_allele_registry
            else self.clone(WriteNewVariantsTableTask),
        ]

    def create_table(self) -> hl.Table:
        """Build the table that ``BaseWriteParquetTask.run()`` writes out.

        This logic used to live in an overriding ``run()`` that merely
        returned the table.  That bypassed the base class's ``run()``, which
        is the code that calls ``create_table()``, checkpoints the result
        and writes the parquet, so nothing was ever written.
        """
        ht = hl.read_table(
            new_variants_table_path(
                self.reference_genome,
                self.dataset_type,
                self.run_id,
            ),
        )
        # Turn enum ids back into their string values before reshaping.
        ht = unmap_formatting_annotation_enums(
            ht,
            self.reference_genome,
            self.dataset_type,
        )
        ht = unmap_reference_dataset_annotation_enums(
            ht,
            self.reference_genome,
            self.dataset_type,
        )
        ht = camelcase_array_structexpression_fields(ht, self.reference_genome)
        ht = subset_filterable_transcripts_fields(
            ht,
            self.reference_genome,
            self.dataset_type,
        )
        # Drop the key so locus/alleles are exported only via the
        # explicit columns below.
        ht = ht.key_by()
        return ht.select(
            xpos=ht.xpos,
            chrom=ht.locus.contig,
            pos=ht.locus.position,
            ref=ht.alleles[0],
            alt=ht.alleles[1],
            variantId=ht.variant_id,
            rsid=ht.rsid,
            CAID=ht.CAID,
            liftedOverChrom=ht.rg37_locus.contig,
            liftedOverPos=ht.rg37_locus.position,
            screenRegionType=ht.screen.region_types.first(),
            predictions=hl.Struct(
                cadd=ht.dbnsfp.CADD_phred,
                eigen=ht.eigen.Eigen_phred,
                fathmm=ht.dbnsfp.fathmm_MKL_coding_score,
                gnomad_noncoding=ht.gnomad_non_coding_constraint.z_score,
                mpc=ht.dbnsfp.MPC_score,
                mut_pred=ht.dbnsfp.MutPred_score,
                mut_tester=ht.dbnsfp.MutationTaster_pred,
                polyphen=ht.dbnsfp.Polyphen2_HVAR_score,
                primate_ai=ht.dbnsfp.PrimateAI_score,
                revel=ht.dbnsfp.REVEL_score,
                sift=ht.dbnsfp.SIFT_score,
                splice_ai=ht.splice_ai.delta_score,
                splice_ai_consequence=ht.splice_ai.splice_consequence,
                vest=ht.dbnsfp.VEST4_score,
            ),
            populations=hl.Struct(
                exac=hl.Struct(
                    ac=ht.exac.AC_Adj,
                    af=ht.exac.AF,
                    an=ht.exac.AN_Adj,
                    filter_af=ht.exac.AF_POPMAX,
                    hemi=ht.exac.AC_Hemi,
                    het=ht.exac.AC_Het,
                    hom=ht.exac.AC_Hom,
                ),
                gnomad_exomes=hl.Struct(
                    ac=ht.gnomad_exomes.AC,
                    af=ht.gnomad_exomes.AF,
                    an=ht.gnomad_exomes.AN,
                    filter_af=ht.gnomad_exomes.AF_POPMAX_OR_GLOBAL,
                    hemi=ht.gnomad_exomes.Hemi,
                    hom=ht.gnomad_exomes.Hom,
                ),
                gnomad_genomes=hl.Struct(
                    ac=ht.gnomad_genomes.AC,
                    af=ht.gnomad_genomes.AF,
                    an=ht.gnomad_genomes.AN,
                    filter_af=ht.gnomad_genomes.AF_POPMAX_OR_GLOBAL,
                    hemi=ht.gnomad_genomes.Hemi,
                    hom=ht.gnomad_genomes.Hom,
                ),
                topmed=hl.Struct(
                    ac=ht.topmed.AC,
                    af=ht.topmed.AF,
                    an=ht.topmed.AN,
                    het=ht.topmed.Het,
                    hom=ht.topmed.Hom,
                ),
            ),
            # Carry over every array-of-struct field unchanged.
            **{f: ht[f] for f in array_structexpression_fields(ht)},
            # Parenthesised for readability; same grouping as the original.
            **(
                {'hgmd': ht.hgmd}
                if FeatureFlag.ACCESS_PRIVATE_REFERENCE_DATASETS
                else {}
            ),
        )
hl.read_table( + TEST_SNV_INDEL_ANNOTATIONS, + ) + ht.write( + new_variants_table_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + TEST_RUN_ID, + ), + ) + + def test_write_new_variants_parquet_test( + self, + ) -> None: + worker = luigi.worker.Worker() + task = WriteNewVariantsParquetTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path='fake_callset', + project_guids=[ + 'fake_project', + ], + project_pedigree_paths=['fake_pedigree'], + skip_validation=True, + run_id=TEST_RUN_ID, + ) + worker.add(task) + worker.run() + self.assertTrue(task.output().exists()) + self.assertTrue(task.complete()) From 488019542ffb412b8ec3813f90ab7d78f9c51631 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Sun, 20 Apr 2025 13:35:48 -0400 Subject: [PATCH 17/55] import --- v03_pipeline/lib/tasks/base/base_write_parquet.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/v03_pipeline/lib/tasks/base/base_write_parquet.py b/v03_pipeline/lib/tasks/base/base_write_parquet.py index f23a33219..6dbf15a4b 100644 --- a/v03_pipeline/lib/tasks/base/base_write_parquet.py +++ b/v03_pipeline/lib/tasks/base/base_write_parquet.py @@ -1,7 +1,10 @@ +import luigi + from v03_pipeline.lib.misc.io import checkpoint +from v03_pipeline.lib.tasks.files import GCSorLocalFolderTarget -class BaseWriteParquetTask(BaseHailTableTask): +class BaseWriteParquetTask(luigi.task): def complete(self) -> luigi.Target: return GCSorLocalFolderTarget(self.output().path).exists() From f28ec6a950dd270973bbc07a2654f591b5f4d717 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Sun, 20 Apr 2025 17:35:23 -0400 Subject: [PATCH 18/55] improve private reference datasets logic --- .../reference_datasets/reference_dataset.py | 15 + v03_pipeline/lib/tasks/exports/misc.py | 18 +- v03_pipeline/lib/tasks/exports/misc_test.py | 454 +----------------- .../exports/write_new_variants_parquet.py | 9 +- 4 files changed, 
37 insertions(+), 459 deletions(-) diff --git a/v03_pipeline/lib/reference_datasets/reference_dataset.py b/v03_pipeline/lib/reference_datasets/reference_dataset.py index 8ed022cfb..f56a8cc88 100644 --- a/v03_pipeline/lib/reference_datasets/reference_dataset.py +++ b/v03_pipeline/lib/reference_datasets/reference_dataset.py @@ -56,6 +56,21 @@ def for_reference_genome_dataset_type( } return set(reference_datasets) + @classmethod + def for_reference_genome_dataset_type_private( + cls, + reference_genome: ReferenceGenome, + dataset_type: DatasetType, + ) -> set[Union['ReferenceDataset']]: + return { + dataset + for dataset in cls.for_reference_genome_dataset_type( + reference_genome, + dataset_type, + ) + if self.access_control == AccessControl.PRIVATE + } + @classmethod def for_reference_genome_dataset_type_annotations( cls, diff --git a/v03_pipeline/lib/tasks/exports/misc.py b/v03_pipeline/lib/tasks/exports/misc.py index 119ba24e1..4cb4da9e8 100644 --- a/v03_pipeline/lib/tasks/exports/misc.py +++ b/v03_pipeline/lib/tasks/exports/misc.py @@ -28,14 +28,16 @@ def camelcase_hl_struct(s: hl.StructExpression) -> hl.StructExpression: def array_structexpression_fields(ht: hl.Table): - return [ - field - for field in ht.row - if isinstance( - ht[field], - hl.expr.expressions.typed_expressions.ArrayStructExpression, - ) - ] + return sorted( + [ + field + for field in ht.row + if isinstance( + ht[field], + hl.expr.expressions.typed_expressions.ArrayStructExpression, + ) + ] + ) def transcripts_field_name( diff --git a/v03_pipeline/lib/tasks/exports/misc_test.py b/v03_pipeline/lib/tasks/exports/misc_test.py index e70e59e06..0b719fcad 100644 --- a/v03_pipeline/lib/tasks/exports/misc_test.py +++ b/v03_pipeline/lib/tasks/exports/misc_test.py @@ -41,6 +41,9 @@ def test_unmap_formatting_annotation_enums(self) -> None: 'eigen', ], ) + ht = ht.annotate( + sorted_transcript_consequences=[ht.sorted_transcript_consequences[0]], + ) self.assertEqual( ht.collect()[0], hl.Struct( @@ -85,230 
+88,6 @@ def test_unmap_formatting_annotation_enums(self) -> None: biotype='protein_coding', consequence_terms=['missense_variant'], ), - hl.Struct( - amino_acids='S/L', - canonical=None, - codons='tCg/tTg', - gene_id='ENSG00000187634', - hgvsc='ENST00000341065.8:c.284C>T', - hgvsp='ENSP00000349216.4:p.Ser95Leu', - transcript_id='ENST00000341065', - mane_select=None, - mane_plus_clinical=None, - exon=hl.Struct(index=4, total=12), - intron=None, - refseq_transcript_id=None, - alphamissense=hl.Struct(pathogenicity=None), - loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), - spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False, - ), - utrannotator=hl.Struct( - existing_inframe_oorfs=None, - existing_outofframe_oorfs=None, - existing_uorfs=None, - fiveutr_annotation=None, - fiveutr_consequence=None, - ), - biotype='protein_coding', - consequence_terms=['missense_variant'], - ), - hl.Struct( - amino_acids='S/L', - canonical=None, - codons='tCg/tTg', - gene_id='ENSG00000187634', - hgvsc='ENST00000342066.8:c.512C>T', - hgvsp='ENSP00000342313.3:p.Ser171Leu', - transcript_id='ENST00000342066', - mane_select=None, - mane_plus_clinical=None, - exon=hl.Struct(index=6, total=14), - intron=None, - refseq_transcript_id='NM_152486.4', - alphamissense=hl.Struct(pathogenicity=0.1467999964952469), - loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), - spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False, - ), - utrannotator=hl.Struct( - existing_inframe_oorfs=None, - existing_outofframe_oorfs=None, - existing_uorfs=None, - fiveutr_annotation=None, - fiveutr_consequence=None, - ), - biotype='protein_coding', - consequence_terms=['missense_variant'], - ), - hl.Struct( - amino_acids='S/L', - canonical=None, - codons='tCg/tTg', - gene_id='ENSG00000187634', - hgvsc='ENST00000616125.5:c.512C>T', - hgvsp='ENSP00000484643.1:p.Ser171Leu', - transcript_id='ENST00000616125', - mane_select=None, - mane_plus_clinical=None, - exon=hl.Struct(index=5, 
total=11), - intron=None, - refseq_transcript_id=None, - alphamissense=hl.Struct(pathogenicity=0.1467999964952469), - loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), - spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False, - ), - utrannotator=hl.Struct( - existing_inframe_oorfs=None, - existing_outofframe_oorfs=None, - existing_uorfs=None, - fiveutr_annotation=None, - fiveutr_consequence=None, - ), - biotype='protein_coding', - consequence_terms=['missense_variant'], - ), - hl.Struct( - amino_acids='S/L', - canonical=None, - codons='tCg/tTg', - gene_id='ENSG00000187634', - hgvsc='ENST00000617307.5:c.512C>T', - hgvsp='ENSP00000482090.2:p.Ser171Leu', - transcript_id='ENST00000617307', - mane_select=None, - mane_plus_clinical=None, - exon=hl.Struct(index=5, total=13), - intron=None, - refseq_transcript_id=None, - alphamissense=hl.Struct(pathogenicity=0.1467999964952469), - loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), - spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False, - ), - utrannotator=hl.Struct( - existing_inframe_oorfs=None, - existing_outofframe_oorfs=None, - existing_uorfs=None, - fiveutr_annotation=None, - fiveutr_consequence=None, - ), - biotype='protein_coding', - consequence_terms=['missense_variant'], - ), - hl.Struct( - amino_acids='S/L', - canonical=None, - codons='tCg/tTg', - gene_id='ENSG00000187634', - hgvsc='ENST00000618181.5:c.461C>T', - hgvsp='ENSP00000480870.1:p.Ser154Leu', - transcript_id='ENST00000618181', - mane_select=None, - mane_plus_clinical=None, - exon=hl.Struct(index=4, total=10), - intron=None, - refseq_transcript_id=None, - alphamissense=hl.Struct(pathogenicity=None), - loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), - spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False, - ), - utrannotator=hl.Struct( - existing_inframe_oorfs=None, - existing_outofframe_oorfs=None, - existing_uorfs=None, - fiveutr_annotation=None, - fiveutr_consequence=None, - ), - 
biotype='protein_coding', - consequence_terms=['missense_variant'], - ), - hl.Struct( - amino_acids='S/L', - canonical=None, - codons='tCg/tTg', - gene_id='ENSG00000187634', - hgvsc='ENST00000618323.5:c.1049C>T', - hgvsp='ENSP00000480678.2:p.Ser350Leu', - transcript_id='ENST00000618323', - mane_select=None, - mane_plus_clinical=None, - exon=hl.Struct(index=6, total=14), - intron=None, - refseq_transcript_id='NM_001385640.1', - alphamissense=hl.Struct(pathogenicity=None), - loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), - spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False, - ), - utrannotator=hl.Struct( - existing_inframe_oorfs=None, - existing_outofframe_oorfs=None, - existing_uorfs=None, - fiveutr_annotation=None, - fiveutr_consequence=None, - ), - biotype='protein_coding', - consequence_terms=['missense_variant'], - ), - hl.Struct( - amino_acids='S/L', - canonical=None, - codons='tCg/tTg', - gene_id='ENSG00000187634', - hgvsc='ENST00000618779.5:c.512C>T', - hgvsp='ENSP00000484256.1:p.Ser171Leu', - transcript_id='ENST00000618779', - mane_select=None, - mane_plus_clinical=None, - exon=hl.Struct(index=5, total=12), - intron=None, - refseq_transcript_id=None, - alphamissense=hl.Struct(pathogenicity=0.1467999964952469), - loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), - spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False, - ), - utrannotator=hl.Struct( - existing_inframe_oorfs=None, - existing_outofframe_oorfs=None, - existing_uorfs=None, - fiveutr_annotation=None, - fiveutr_consequence=None, - ), - biotype='protein_coding', - consequence_terms=['missense_variant'], - ), - hl.Struct( - amino_acids='S/L', - canonical=None, - codons='tCg/tTg', - gene_id='ENSG00000187634', - hgvsc='ENST00000622503.5:c.512C>T', - hgvsp='ENSP00000482138.1:p.Ser171Leu', - transcript_id='ENST00000622503', - mane_select=None, - mane_plus_clinical=None, - exon=hl.Struct(index=5, total=13), - intron=None, - refseq_transcript_id=None, - 
alphamissense=hl.Struct(pathogenicity=0.1467999964952469), - loftee=hl.Struct(is_lof_nagnag=None, lof_filters=None), - spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False, - ), - utrannotator=hl.Struct( - existing_inframe_oorfs=None, - existing_outofframe_oorfs=None, - existing_uorfs=None, - fiveutr_annotation=None, - fiveutr_consequence=None, - ), - biotype='protein_coding', - consequence_terms=['missense_variant'], - ), ], variant_id='1-939121-C-T', xpos=1000939121, @@ -509,6 +288,9 @@ def test_camelcase_array_structexpression_fields(self) -> None: DatasetType.SNV_INDEL, ) ht = camelcase_array_structexpression_fields(ht, ReferenceGenome.GRCh38) + ht = ht.annotate( + sortedTranscriptConsequences=[ht.sortedTranscriptConsequences[0]], + ) self.assertEqual( ht.collect()[0], hl.Struct( @@ -621,230 +403,6 @@ def test_camelcase_array_structexpression_fields(self) -> None: biotype='protein_coding', consequenceTerms=['missense_variant'], ), - hl.Struct( - aminoAcids='S/L', - canonical=None, - codons='tCg/tTg', - geneId='ENSG00000187634', - hgvsc='ENST00000341065.8:c.284C>T', - hgvsp='ENSP00000349216.4:p.Ser95Leu', - transcriptId='ENST00000341065', - maneSelect=None, - manePlusClinical=None, - exon=hl.Struct(index=4, total=12), - intron=None, - refseqTranscriptId=None, - alphamissense=hl.Struct(pathogenicity=None), - loftee=hl.Struct(isLofNagnag=None, lofFilters=None), - spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False, - ), - utrannotator=hl.Struct( - existingInframeOorfs=None, - existingOutofframeOorfs=None, - existingUorfs=None, - fiveutrAnnotation=None, - fiveutrConsequence=None, - ), - biotype='protein_coding', - consequenceTerms=['missense_variant'], - ), - hl.Struct( - aminoAcids='S/L', - canonical=None, - codons='tCg/tTg', - geneId='ENSG00000187634', - hgvsc='ENST00000342066.8:c.512C>T', - hgvsp='ENSP00000342313.3:p.Ser171Leu', - transcriptId='ENST00000342066', - maneSelect=None, - manePlusClinical=None, - 
exon=hl.Struct(index=6, total=14), - intron=None, - refseqTranscriptId='NM_152486.4', - alphamissense=hl.Struct(pathogenicity=0.1467999964952469), - loftee=hl.Struct(isLofNagnag=None, lofFilters=None), - spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False, - ), - utrannotator=hl.Struct( - existingInframeOorfs=None, - existingOutofframeOorfs=None, - existingUorfs=None, - fiveutrAnnotation=None, - fiveutrConsequence=None, - ), - biotype='protein_coding', - consequenceTerms=['missense_variant'], - ), - hl.Struct( - aminoAcids='S/L', - canonical=None, - codons='tCg/tTg', - geneId='ENSG00000187634', - hgvsc='ENST00000616125.5:c.512C>T', - hgvsp='ENSP00000484643.1:p.Ser171Leu', - transcriptId='ENST00000616125', - maneSelect=None, - manePlusClinical=None, - exon=hl.Struct(index=5, total=11), - intron=None, - refseqTranscriptId=None, - alphamissense=hl.Struct(pathogenicity=0.1467999964952469), - loftee=hl.Struct(isLofNagnag=None, lofFilters=None), - spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False, - ), - utrannotator=hl.Struct( - existingInframeOorfs=None, - existingOutofframeOorfs=None, - existingUorfs=None, - fiveutrAnnotation=None, - fiveutrConsequence=None, - ), - biotype='protein_coding', - consequenceTerms=['missense_variant'], - ), - hl.Struct( - aminoAcids='S/L', - canonical=None, - codons='tCg/tTg', - geneId='ENSG00000187634', - hgvsc='ENST00000617307.5:c.512C>T', - hgvsp='ENSP00000482090.2:p.Ser171Leu', - transcriptId='ENST00000617307', - maneSelect=None, - manePlusClinical=None, - exon=hl.Struct(index=5, total=13), - intron=None, - refseqTranscriptId=None, - alphamissense=hl.Struct(pathogenicity=0.1467999964952469), - loftee=hl.Struct(isLofNagnag=None, lofFilters=None), - spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False, - ), - utrannotator=hl.Struct( - existingInframeOorfs=None, - existingOutofframeOorfs=None, - existingUorfs=None, - fiveutrAnnotation=None, - fiveutrConsequence=None, - ), - 
biotype='protein_coding', - consequenceTerms=['missense_variant'], - ), - hl.Struct( - aminoAcids='S/L', - canonical=None, - codons='tCg/tTg', - geneId='ENSG00000187634', - hgvsc='ENST00000618181.5:c.461C>T', - hgvsp='ENSP00000480870.1:p.Ser154Leu', - transcriptId='ENST00000618181', - maneSelect=None, - manePlusClinical=None, - exon=hl.Struct(index=4, total=10), - intron=None, - refseqTranscriptId=None, - alphamissense=hl.Struct(pathogenicity=None), - loftee=hl.Struct(isLofNagnag=None, lofFilters=None), - spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False, - ), - utrannotator=hl.Struct( - existingInframeOorfs=None, - existingOutofframeOorfs=None, - existingUorfs=None, - fiveutrAnnotation=None, - fiveutrConsequence=None, - ), - biotype='protein_coding', - consequenceTerms=['missense_variant'], - ), - hl.Struct( - aminoAcids='S/L', - canonical=None, - codons='tCg/tTg', - geneId='ENSG00000187634', - hgvsc='ENST00000618323.5:c.1049C>T', - hgvsp='ENSP00000480678.2:p.Ser350Leu', - transcriptId='ENST00000618323', - maneSelect=None, - manePlusClinical=None, - exon=hl.Struct(index=6, total=14), - intron=None, - refseqTranscriptId='NM_001385640.1', - alphamissense=hl.Struct(pathogenicity=None), - loftee=hl.Struct(isLofNagnag=None, lofFilters=None), - spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False, - ), - utrannotator=hl.Struct( - existingInframeOorfs=None, - existingOutofframeOorfs=None, - existingUorfs=None, - fiveutrAnnotation=None, - fiveutrConsequence=None, - ), - biotype='protein_coding', - consequenceTerms=['missense_variant'], - ), - hl.Struct( - aminoAcids='S/L', - canonical=None, - codons='tCg/tTg', - geneId='ENSG00000187634', - hgvsc='ENST00000618779.5:c.512C>T', - hgvsp='ENSP00000484256.1:p.Ser171Leu', - transcriptId='ENST00000618779', - maneSelect=None, - manePlusClinical=None, - exon=hl.Struct(index=5, total=12), - intron=None, - refseqTranscriptId=None, - alphamissense=hl.Struct(pathogenicity=0.1467999964952469), - 
loftee=hl.Struct(isLofNagnag=None, lofFilters=None), - spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False, - ), - utrannotator=hl.Struct( - existingInframeOorfs=None, - existingOutofframeOorfs=None, - existingUorfs=None, - fiveutrAnnotation=None, - fiveutrConsequence=None, - ), - biotype='protein_coding', - consequenceTerms=['missense_variant'], - ), - hl.Struct( - aminoAcids='S/L', - canonical=None, - codons='tCg/tTg', - geneId='ENSG00000187634', - hgvsc='ENST00000622503.5:c.512C>T', - hgvsp='ENSP00000482138.1:p.Ser171Leu', - transcriptId='ENST00000622503', - maneSelect=None, - manePlusClinical=None, - exon=hl.Struct(index=5, total=13), - intron=None, - refseqTranscriptId=None, - alphamissense=hl.Struct(pathogenicity=0.1467999964952469), - loftee=hl.Struct(isLofNagnag=None, lofFilters=None), - spliceregion=hl.Struct( - extended_intronic_splice_region_variant=False, - ), - utrannotator=hl.Struct( - existingInframeOorfs=None, - existingOutofframeOorfs=None, - existingUorfs=None, - fiveutrAnnotation=None, - fiveutrConsequence=None, - ), - biotype='protein_coding', - consequenceTerms=['missense_variant'], - ), ], sortedRegulatoryFeatureConsequences=[ hl.Struct( diff --git a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py index 4b88a9445..10fe94901 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py +++ b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py @@ -134,7 +134,10 @@ def run(self) -> None: ), ), **{f: ht[f] for f in array_structexpression_fields(ht)}, - **{'hgmd': ht.hgmd} - if FeatureFlag.ACCESS_PRIVATE_REFERENCE_DATASETS - else {}, + **{ + rd: ht[rd] + for rd in BaseReferenceDataset.for_reference_genome_dataset_type( + self.reference_genome, self.dataset_type + ) + }, ) From 359d7735a42e9f2cc93bee26ae3bf104b46949a0 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Sun, 20 Apr 2025 18:54:34 -0400 Subject: [PATCH 19/55] 
first pass --- .../base_update_variant_annotations_table.py | 1 + .../lib/tasks/write_new_variants_table.py | 6 +++++ .../annotations/0007_add_key_field.py | 23 +++++++++++++++++++ 3 files changed, 30 insertions(+) create mode 100644 v03_pipeline/migrations/annotations/0007_add_key_field.py diff --git a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py index 29c374e48..036a73bd3 100644 --- a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py +++ b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py @@ -69,6 +69,7 @@ def initialize_table(self) -> hl.Table: ), ), migrations=hl.empty_array(hl.tstr), + max_seen_id=hl.tint64, ), ) diff --git a/v03_pipeline/lib/tasks/write_new_variants_table.py b/v03_pipeline/lib/tasks/write_new_variants_table.py index 1177d66ec..cb7904596 100644 --- a/v03_pipeline/lib/tasks/write_new_variants_table.py +++ b/v03_pipeline/lib/tasks/write_new_variants_table.py @@ -192,6 +192,12 @@ def create_table(self) -> hl.Table: }, ) new_variants_ht = new_variants_ht.join(reference_dataset_ht, 'left') + + # Add serial integer index + new_variants_ht = new_variants_ht.add_index(name='key_') + new_variants_ht = new_variants_ht.transmute( + key_=new_variants_ht.key_ + annotations_ht.index_globals().max_seen_id, + ) return new_variants_ht.select_globals( updates={ hl.Struct( diff --git a/v03_pipeline/migrations/annotations/0007_add_key_field.py b/v03_pipeline/migrations/annotations/0007_add_key_field.py new file mode 100644 index 000000000..592f8c74a --- /dev/null +++ b/v03_pipeline/migrations/annotations/0007_add_key_field.py @@ -0,0 +1,23 @@ +import hail as hl + +from v03_pipeline.lib.migration.base_migration import BaseMigration +from v03_pipeline.lib.model import DatasetType, ReferenceGenome + + +class AddKeyField(BaseMigration): + reference_genome_dataset_types: frozenset[tuple[ReferenceGenome, DatasetType]] = ( + frozenset( + ( 
+ (ReferenceGenome.GRCh37, DatasetType.SNV_INDEL), + (ReferenceGenome.GRCh38, DatasetType.SNV_INDEL), + (ReferenceGenome.GRCh38, DatasetType.MITO), + (ReferenceGenome.GRCh38, DatasetType.GCNV), + (ReferenceGenome.GRCh38, DatasetType.SV), + ), + ) + ) + + @staticmethod + def migrate(ht: hl.Table, **_) -> hl.Table: + ht = ht.add_index(name='key_') + return ht.annotate_globals(max_seen_id=(ht.count() - 1)) From 2d4420f83984d3e5de443a737b0fe9d584038beb Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Sun, 20 Apr 2025 19:13:50 -0400 Subject: [PATCH 20/55] progress --- .../lib/tasks/base/base_update_variant_annotations_table.py | 2 +- .../lib/tasks/migrate_variant_annotations_table_test.py | 1 + .../tasks/update_variant_annotations_table_with_new_samples.py | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py index 036a73bd3..4e4e15bb7 100644 --- a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py +++ b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py @@ -69,7 +69,7 @@ def initialize_table(self) -> hl.Table: ), ), migrations=hl.empty_array(hl.tstr), - max_seen_id=hl.tint64, + max_key_=hl.int64(0), ), ) diff --git a/v03_pipeline/lib/tasks/migrate_variant_annotations_table_test.py b/v03_pipeline/lib/tasks/migrate_variant_annotations_table_test.py index 0438c9fbc..b131602db 100644 --- a/v03_pipeline/lib/tasks/migrate_variant_annotations_table_test.py +++ b/v03_pipeline/lib/tasks/migrate_variant_annotations_table_test.py @@ -69,6 +69,7 @@ def test_mock_migration( enums=hl.Struct(), updates=set(), migrations=['0012_mock_migration'], + max_key_=0, mock_migration='a mock migration', ), ], diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py index 
2c58381d3..4e9b36a64 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py @@ -137,4 +137,5 @@ def update_table(self, ht: hl.Table) -> hl.Table: for i, project_guid in enumerate(self.project_guids) }, ), + max_key_=ht.aggregate(hl.agg.max(ht.key_)), ) From 3964c8fa4ceeeb3f6ad5edcb38d2ee94e825af2d Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Mon, 21 Apr 2025 00:33:21 -0400 Subject: [PATCH 21/55] tests passing --- .../base_update_variant_annotations_table.py | 3 ++- .../migrate_variant_annotations_table.py | 1 + .../migrate_variant_annotations_table_test.py | 4 +++- ...annotations_table_with_new_samples_test.py | 22 +++++++++++++++++++ .../lib/tasks/write_new_variants_table.py | 2 +- .../annotations/0007_add_key_field.py | 2 +- 6 files changed, 30 insertions(+), 4 deletions(-) diff --git a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py index 4e4e15bb7..7a817bace 100644 --- a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py +++ b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py @@ -69,7 +69,7 @@ def initialize_table(self) -> hl.Table: ), ), migrations=hl.empty_array(hl.tstr), - max_key_=hl.int64(0), + max_key_=hl.int64(-1), ), ) @@ -101,5 +101,6 @@ def annotate_globals( ), updates=ht.globals.updates, migrations=ht.globals.migrations, + max_key_=ht.globals.max_key_, ) return annotate_enums(ht, self.reference_genome, self.dataset_type) diff --git a/v03_pipeline/lib/tasks/migrate_variant_annotations_table.py b/v03_pipeline/lib/tasks/migrate_variant_annotations_table.py index 44e3debde..ace64e1d9 100644 --- a/v03_pipeline/lib/tasks/migrate_variant_annotations_table.py +++ b/v03_pipeline/lib/tasks/migrate_variant_annotations_table.py @@ -40,5 +40,6 @@ def initialize_table(self) -> hl.Table: ), ), 
migrations=hl.empty_array(hl.tstr), + max_key_=hl.int64(-1), ), ) diff --git a/v03_pipeline/lib/tasks/migrate_variant_annotations_table_test.py b/v03_pipeline/lib/tasks/migrate_variant_annotations_table_test.py index b131602db..392e847af 100644 --- a/v03_pipeline/lib/tasks/migrate_variant_annotations_table_test.py +++ b/v03_pipeline/lib/tasks/migrate_variant_annotations_table_test.py @@ -69,7 +69,7 @@ def test_mock_migration( enums=hl.Struct(), updates=set(), migrations=['0012_mock_migration'], - max_key_=0, + max_key_=-1, mock_migration='a mock migration', ), ], @@ -109,6 +109,7 @@ def test_migration_is_noop_for_other_dataset_types( enums=hl.Struct(), updates=set(), migrations=[], + max_key_=-1, ), ], ) @@ -144,6 +145,7 @@ def test_migration_dependency( enums=hl.Struct(), updates=set(), migrations=['0012_mock_migration', '0013_mock_migration2'], + max_key_=-1, mock_migration='a mock migration', mock_migration2='a second mock migration', ), diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index 34755dfe5..385ffcdfa 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -335,6 +335,10 @@ def test_multiple_update_vat( }, ], ) + self.assertEqual( + hl.eval(ht.globals.max_key_), + 29, + ) # Ensure that new variants are added correctly to the table. 
uvatwns_task_4 = UpdateVariantAnnotationsTableWithNewSamplesTask( @@ -504,6 +508,7 @@ def test_multiple_update_vat( hgmd='1.0', ), migrations=[], + max_key_=29, enums=hl.Struct( clinvar=ReferenceDataset.clinvar.enum_globals, dbnsfp=ReferenceDataset.dbnsfp.enum_globals, @@ -691,6 +696,7 @@ def test_update_vat_grch37( hgmd=None, gt_stats=hl.Struct(AC=0, AN=6, AF=0.0, hom=0), CAID=None, + key_=0, ), ) @@ -838,6 +844,7 @@ def test_mito_update_vat( mitotip=hl.Struct(trna_prediction=MITOTIP_PATHOGENICITIES), ), migrations=[], + max_key_=4, updates={ hl.Struct( callset='v03_pipeline/var/test/callsets/mito_1.mt', @@ -888,6 +895,7 @@ def test_mito_update_vat( AN=4, ), local_constraint_mito=None, + key_=0, ), ) @@ -936,6 +944,7 @@ def test_sv_multiple_vcf_update_vat( ), ), migrations=[], + max_key_=12, updates={ hl.Struct( callset=TEST_SV_VCF, @@ -995,6 +1004,7 @@ def test_sv_multiple_vcf_update_vat( sv_type_id=2, sv_type_detail_id=None, xpos=1000180928, + key_=0, ), hl.Struct( variant_id='BND_chr1_9', @@ -1038,6 +1048,7 @@ def test_sv_multiple_vcf_update_vat( sv_type_id=2, sv_type_detail_id=None, xpos=1000789481, + key_=1, ), hl.Struct( variant_id='CPX_chr1_22', @@ -1091,6 +1102,7 @@ def test_sv_multiple_vcf_update_vat( sv_type_id=3, sv_type_detail_id=2, xpos=1006558902, + key_=2, ), hl.Struct( variant_id='CPX_chr1_251', @@ -1147,6 +1159,7 @@ def test_sv_multiple_vcf_update_vat( sv_type_id=3, sv_type_detail_id=9, xpos=1180540234, + key_=3, ), hl.Struct( variant_id='CPX_chr1_41', @@ -1200,6 +1213,7 @@ def test_sv_multiple_vcf_update_vat( sv_type_id=3, sv_type_detail_id=12, xpos=1016088760, + key_=4, ), hl.Struct( variant_id='CPX_chr1_54', @@ -1258,6 +1272,7 @@ def test_sv_multiple_vcf_update_vat( sv_type_id=3, sv_type_detail_id=13, xpos=1021427498, + key_=5, ), hl.Struct( variant_id='CPX_chrX_251', @@ -1326,6 +1341,7 @@ def test_sv_multiple_vcf_update_vat( position=2699041, reference_genome='GRCh37', ), + key_=6, ), hl.Struct( variant_id='CPX_chrX_252', @@ -1398,6 +1414,7 @@ 
def test_sv_multiple_vcf_update_vat( position=2699941, reference_genome='GRCh37', ), + key_=7, ), ], ) @@ -1432,6 +1449,7 @@ def test_sv_multiple_vcf_update_vat( ), ), migrations=[], + max_key_=13, updates={ hl.Struct( callset=TEST_SV_VCF, @@ -1500,6 +1518,7 @@ def test_sv_multiple_vcf_update_vat( sv_type_id=2, sv_type_detail_id=None, xpos=1000180928, + key_=0, ), ], ) @@ -1582,6 +1601,7 @@ def test_gcnv_update_vat_multiple( ), ), migrations=[], + max_key_=1, updates={ hl.Struct( callset=TEST_GCNV_BED_FILE, @@ -1637,6 +1657,7 @@ def test_gcnv_update_vat_multiple( strvctvre=hl.Struct(score=hl.eval(hl.float32(0.583))), sv_type_id=5, xpos=1100006937, + key_=0, ), hl.Struct( variant_id='suffix_16457_DEL', @@ -1675,6 +1696,7 @@ def test_gcnv_update_vat_multiple( strvctvre=hl.Struct(score=0.5070000290870667), sv_type_id=5, xpos=1100017586, + key_=1, ), ], ) diff --git a/v03_pipeline/lib/tasks/write_new_variants_table.py b/v03_pipeline/lib/tasks/write_new_variants_table.py index cb7904596..63ff75739 100644 --- a/v03_pipeline/lib/tasks/write_new_variants_table.py +++ b/v03_pipeline/lib/tasks/write_new_variants_table.py @@ -196,7 +196,7 @@ def create_table(self) -> hl.Table: # Add serial integer index new_variants_ht = new_variants_ht.add_index(name='key_') new_variants_ht = new_variants_ht.transmute( - key_=new_variants_ht.key_ + annotations_ht.index_globals().max_seen_id, + key_=new_variants_ht.key_ + annotations_ht.index_globals().max_key_ + 1, ) return new_variants_ht.select_globals( updates={ diff --git a/v03_pipeline/migrations/annotations/0007_add_key_field.py b/v03_pipeline/migrations/annotations/0007_add_key_field.py index 592f8c74a..198fc1766 100644 --- a/v03_pipeline/migrations/annotations/0007_add_key_field.py +++ b/v03_pipeline/migrations/annotations/0007_add_key_field.py @@ -20,4 +20,4 @@ class AddKeyField(BaseMigration): @staticmethod def migrate(ht: hl.Table, **_) -> hl.Table: ht = ht.add_index(name='key_') - return 
ht.annotate_globals(max_seen_id=(ht.count() - 1)) + return ht.annotate_globals(max_key_=(ht.count() - 1)) From 8b67aa7fe6998b7cfb22f3de392e99c294d83bb2 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Mon, 21 Apr 2025 15:35:59 -0400 Subject: [PATCH 22/55] key --- v03_pipeline/lib/tasks/base/base_write_parquet.py | 2 +- v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py | 1 + v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/v03_pipeline/lib/tasks/base/base_write_parquet.py b/v03_pipeline/lib/tasks/base/base_write_parquet.py index 6dbf15a4b..0f5d7fc43 100644 --- a/v03_pipeline/lib/tasks/base/base_write_parquet.py +++ b/v03_pipeline/lib/tasks/base/base_write_parquet.py @@ -11,7 +11,7 @@ def complete(self) -> luigi.Target: def run(self) -> None: ht = self.create_table() ht, _ = checkpoint(ht) - df = ht.to_spark() + df = ht.to_spark(flatten=False) df.write.parquet( self.output().path, mode='overwrite', diff --git a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py index edb57b992..932c50baa 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py +++ b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py @@ -59,6 +59,7 @@ def run(self) -> None: ht = camelcase_array_structexpression_fields(ht, self.reference_genome) ht = ht.key_by() return ht.select( + key_=ht.key_, transcripts=ht[ transcripts_field_name(self.reference_genome, self.dataset_type) ] diff --git a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py index 10fe94901..5d3b8eba8 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py +++ b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py @@ -72,6 +72,7 @@ def run(self) -> None: ) ht = ht.key_by() return ht.select( + key_=ht.key_, xpos=ht.xpos, 
chrom=ht.locus.contig, pos=ht.locus.position, From 752b123e76359a8a37ef4b2d03cbc933b1ac4af8 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Mon, 21 Apr 2025 16:05:32 -0400 Subject: [PATCH 23/55] annotations table --- .../lib/tasks/base/base_write_parquet.py | 2 +- .../SNV_INDEL/annotations.ht/.README.txt.crc | Bin 12 -> 12 bytes .../annotations.ht/.metadata.json.gz.crc | Bin 20 -> 20 bytes .../SNV_INDEL/annotations.ht/README.txt | 2 +- .../globals/.metadata.json.gz.crc | Bin 16 -> 16 bytes .../annotations.ht/globals/metadata.json.gz | Bin 751 -> 762 bytes .../annotations.ht/globals/parts/.part-0.crc | Bin 104 -> 104 bytes .../annotations.ht/globals/parts/part-0 | Bin 11895 -> 11896 bytes .../.index.crc | Bin 12 -> 0 bytes .../index | Bin 87 -> 0 bytes .../.index.crc | Bin 0 -> 12 bytes .../.metadata.json.gz.crc | Bin .../index | Bin 0 -> 87 bytes .../metadata.json.gz | Bin .../SNV_INDEL/annotations.ht/metadata.json.gz | Bin 1315 -> 1326 bytes .../annotations.ht/rows/.metadata.json.gz.crc | Bin 24 -> 24 bytes .../annotations.ht/rows/metadata.json.gz | Bin 1815 -> 1820 bytes ...0-034376f0-4c6b-4bf0-8912-4035d651b982.crc | Bin 16 -> 0 bytes ...0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.crc | Bin 0 -> 16 bytes ...art-0-034376f0-4c6b-4bf0-8912-4035d651b982 | Bin 928 -> 0 bytes ...art-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3 | Bin 0 -> 929 bytes 21 files changed, 2 insertions(+), 2 deletions(-) delete mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-034376f0-4c6b-4bf0-8912-4035d651b982.idx/.index.crc delete mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-034376f0-4c6b-4bf0-8912-4035d651b982.idx/index create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.idx/.index.crc rename v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/{part-0-034376f0-4c6b-4bf0-8912-4035d651b982.idx => 
part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.idx}/.metadata.json.gz.crc (100%) create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.idx/index rename v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/{part-0-034376f0-4c6b-4bf0-8912-4035d651b982.idx => part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.idx}/metadata.json.gz (100%) delete mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/.part-0-034376f0-4c6b-4bf0-8912-4035d651b982.crc create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/.part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.crc delete mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/part-0-034376f0-4c6b-4bf0-8912-4035d651b982 create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3 diff --git a/v03_pipeline/lib/tasks/base/base_write_parquet.py b/v03_pipeline/lib/tasks/base/base_write_parquet.py index 0f5d7fc43..87c16e876 100644 --- a/v03_pipeline/lib/tasks/base/base_write_parquet.py +++ b/v03_pipeline/lib/tasks/base/base_write_parquet.py @@ -4,7 +4,7 @@ from v03_pipeline.lib.tasks.files import GCSorLocalFolderTarget -class BaseWriteParquetTask(luigi.task): +class BaseWriteParquetTask(luigi.Task): def complete(self) -> luigi.Target: return GCSorLocalFolderTarget(self.output().path).exists() diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/.README.txt.crc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/.README.txt.crc index 66f7dd3416ab4534150994a8aa38a75928280e64..bf6256713d879829918ffe9996699c1bdcd22c58 100644 GIT binary patch literal 12 TcmYc;N@ieSU}C7aInNFN6J-Nv literal 12 TcmYc;N@ieSU}A{8c|{!n6FviI diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/.metadata.json.gz.crc 
b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/.metadata.json.gz.crc index 513ecd0d33551fff1d663663e60e663c44852f6d..04084645f54cd62cdbaa7618a71d2e940992ecf6 100644 GIT binary patch literal 20 bcmYc;N@ieSU}8v8&-985s5^dNbXfraHNOUQ literal 20 ccmYc;N@ieSU}E_HC9Y|q`G>`RFE6nJ080D_i2wiq diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/README.txt b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/README.txt index 6e4fca836..76c4a0d1b 100644 --- a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/README.txt +++ b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. Written with version 0.2.133-4c60fddb171a - Created at 2025/04/09 21:10:13 \ No newline at end of file + Created at 2025/04/21 16:05:23 \ No newline at end of file diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/globals/.metadata.json.gz.crc index 2d0b45e3f15e98be14c524fe3ec0a037d86b89b8..a37c1cabbbabc15da64c3963c76039eb01c76393 100644 GIT binary patch literal 16 XcmYc;N@ieSU}Dg#H+bZACv*n@A^im{ literal 16 XcmYc;N@ieSU}A`x`Tfk)h`WCQDMSXi diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/globals/metadata.json.gz b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/globals/metadata.json.gz index 35c57a2a0e54bd49120bba5d1c00c918e0833fee..ea9fe0803d8aae31bb1cfdac1635277ae32e0413 100644 GIT binary patch delta 669 zcmV;O0%HB|1^NY$J_GJM>5)M%e<%tZM4mT`IkT`t({S8vz*wo3okghS20%GXDrUB+ zSSg#=-sYuL8)iIEC|xm;!IQ4Qj$oygyQ5SE{V9uvNrgVj)}`c-G3gk5Ro z*F_~phZLaV!$B574Q&dlsZQ4EF#`oPBtvQzIX)W0ws(D!`!;BWr+jaae<6`>nME*J zZT6QPQG>2|F0&SlBaazr&2X)$2+1XGr!tP(?(?=UHb0DwMiBJp06QN1Zb0$Lmh{64 z6Z(;bXg@fQ@sb+wOliSX!$il~Mn!0YAhJrM%(EOAHYU3JAWPuMXl&KgExMHSPBbM$ 
zZMJ9RT}eIt@}#dgk;bIxf9;et{-=}DQs)Q|CnPM(Wl4;wt2ORYN~3E;%**8l$os;J z(o%|AFobB}!9ZA9S_p)pHu?cLWMr;3|S6^D8lD+F1zK>hM}R=>}k882fDPA_2>f8&?01NIBmJ$H-L zS^EWwQ}-LvbN8EZFnY}!)iTt%L!%BPuW`?)+kg;sBG$AhZlGs_9HaJ~p<)yIby~Rw z=k-mF^eR@WNTa5&8+1QZrdMdvDK#3Lccu~zlp*f_nP;5Ro`1;QtIb$v23LqQ<{5Q~ zX@RmV0q+KS(rrehe9aC%#@ly^`@DR( zVXA<+Zj^AYU1E`(l$J=K#Nk0IUN(ZHeD9(Gz!6)x7>Bej4;w=?)^XauNVu3V-3kBz Dm_k0( delta 658 zcmV;D0&V^J1@8rrJ_GdKosmH=f9MoAh&*o=b7o2|F0&SlBaazr&2X*h5|T^YPDLEG-N$WTY<_q)8bQ#X1MGP4y8*=~Thb3J z4CqG|qW$1J#!G6zGgk|y8U{MfHoAm12qLRA$~?<~VPm4&2T=k~Mq{g{s^~(}JJFO3 zwb`B#cO~`o%agt$BaKPXf7>Z*{7;$EQs)Q|2P7=ZWl4;w(i&Hk(&!oy^K!WX^1kq* zv=pKi3?UkLFc4Oj76M_YjeY=*S<%Q~-wjl(&Pd&4LED?HnO)BoUlm?nwD)O6;C>f< zj$?i&f}2hRhw>K-0$H>#{jPUbzrLLr?_LZ}?_77r?^*}!_osXAfA*%c_WKft?iZrx z?ib-;^b$9^%TU%1eL0Z4q&*{M147V=Si+)n1Nj=n7`5*V(VCE}(?T`K*Eczmr&yRG zNt!M<=zb_fuaKZq;xov1rtl1eAnyN}IZjEp1OS@daE&1icCX08Jb|%>V!Z diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/globals/parts/.part-0.crc index ebf652b1cd0c25590abae55095e71472147f1221..cf52b87c09fe155a4804c1887689a0c76e97f508 100644 GIT binary patch delta 17 Zcmd1Em=MFW+v(%FO+nSu8N^-R002e}2g3jW delta 17 Zcmd1Em=MFWu2pX3-W88RSfjkB0surt2b}-_ diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/globals/parts/part-0 b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/globals/parts/part-0 index 0829aa95e130fa7a307b36652720db8610028783..cec183733229ce0b7492b4a817cc9d9ce7aac222 100644 GIT binary patch delta 40 wcmew!^CM=%Wi62j%nS@Zd<+a4TlN1Y`0$F#Ftokd{9J1qBO~MHTAg2v05fC`ZvX%Q delta 38 ucmewn^F3z6Wi63@W(EdtJ_ZJjt@?ixym@(L7|Pacey%l*adVx{FGc_s@D39I diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-034376f0-4c6b-4bf0-8912-4035d651b982.idx/.index.crc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-034376f0-4c6b-4bf0-8912-4035d651b982.idx/.index.crc 
deleted file mode 100644 index f20612e84c6d209d01c4fc5071e22dd2dbc03edc..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12 TcmYc;N@ieSU}DIST0a8-5o7}d diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-034376f0-4c6b-4bf0-8912-4035d651b982.idx/index b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-034376f0-4c6b-4bf0-8912-4035d651b982.idx/index deleted file mode 100644 index e2870eee35c0d6e9236e8b159830c948da6438ee..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 87 zcmdOAU|>)JVvVi(e-)Gj8G%eDmgI~g!;dR0nHU(I8ABMr+)rSxJEP-2E(M?(VVD-- MnG6h!KwW5h01R0YQUCw| diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.idx/.index.crc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.idx/.index.crc new file mode 100644 index 0000000000000000000000000000000000000000..812242553eab389da26ce5f3d0090e92c85b3cc2 GIT binary patch literal 12 TcmYc;N@ieSU}AVWDX0nn6X*ke literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-034376f0-4c6b-4bf0-8912-4035d651b982.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.idx/.metadata.json.gz.crc similarity index 100% rename from v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-034376f0-4c6b-4bf0-8912-4035d651b982.idx/.metadata.json.gz.crc rename to v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.idx/.metadata.json.gz.crc diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.idx/index 
b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.idx/index new file mode 100644 index 0000000000000000000000000000000000000000..60c06ecd3401279de49c80fc184866f61b80c372 GIT binary patch literal 87 zcmdOAU|>)JVvVi(e-)Gj8G%eDmgI~g!;dR0nHU(I8ABMr+)rSxJEP-&E(M?(VVD-- MnG6h!KwW5h01ST;Qvd(} literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-034376f0-4c6b-4bf0-8912-4035d651b982.idx/metadata.json.gz b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.idx/metadata.json.gz similarity index 100% rename from v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-034376f0-4c6b-4bf0-8912-4035d651b982.idx/metadata.json.gz rename to v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.idx/metadata.json.gz diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/metadata.json.gz b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/metadata.json.gz index 481c84cd54442cedceb31a2278068c30694140f9..5c693f352b5f617552ca2d1f95446a02fb119b07 100644 GIT binary patch delta 1160 zcmV;31b6$R3a$!}9TWu?1RNEuxRVgFOfpHlICjo<*mjXI7y$;6N*{lD+&kW*lW3wK z1qC7oO%!laQj;TdheZ}cA5p#8lz^0#1zs{PA1R+MjjCev(iqVm*0BN*7Gffcl1HJ_ z(nB$4;+ZNBWHZZvY&k8He|E4?QGBp5R1wsTH9r=bXRMxnV#)jy*Q}zsqf1{XN6f6^Jtdn5w zeM%|k8rn81mHYv*Az8)R9#T-yl9Z6J410s*ROizRVGj1U4^e?Qcklf0pFH0{vC0mP$qxJRrwC_K>+(BA zYBjn>ESQjl#w=;)Lrk%i7^bH79_yAgiif5U45t&!0< zC@*tN9ZoUy$ZP5kuQM*G)6+lLOcQ&y1R7Blz--*eOu6i1KHLx~-jwhrJ zhQ*|rXU)*YS<5KSzsA6L+rXM>V+SS^siFlCDOIYQT(m@48JWI8uL9&qBX;HU#Kw;x z8aFMP)LzP8^l2`I!4P{mfBXiKncO2!&^|>2D>*+|HYVn@?9fX?m1zheR+bI&^qD1Y zR`@c^xN+phLIaK&I7YPZBt zg@=18Z~#6;&7la+@w$|*bVr9#W(evw?GT^M8~wAt{KuI;)KM-Af8Qqa*~LjaI&z4g z@Cc@DMn8Bd^=8s<^!hzNHH8lykDX^wh{ z#O0z*#w%-3P@Aq^zJljb)y=orRV#Of#6ZnC6l`E}u*!VI 
zraLyjW%JctbYw#If7j84Vg=sxvx|P3qZH?JQIPA)D_@#EeLg!mA!Tm2SC3j=oUI02 zx2JcfR|DKa@=ck8*eB*+9*l%#t5%iAk%&0bQjn#a9q*cS0lkcLLnde?hprJ9!XoZ*M-eK)+bK zj|{g$T?A7kMfLX1S_C&W8+L#18V`q1C4An?aqOJ!uym0z7y$y2N*{k&?;Y>aNi z$v|}mA~XmrYyI6uAgUc=Bo+%P$S_csi|I@$y1B1~9=1W4mhq}G)FA6ElLY~8f32C^ zEo}5b8<<>yXN8sKf72atnHm7E0wIEpj(kHGDr;OwV?*@ZDSHCMzfsIB3YG~q)=4n; zKBbg%4Q-p1O8$V@kgQ^D4=E^UNlHjqhP^>@s`Kf&FbDhFhp0fDJNP@be}v8Sp3At> z5t;xF`XN&CXBQn(rkD2jkMB40fBfL*G6BX*IgeE0~ak z#w=+ELQJuD80Ml53=5Ss^UVzr5I|TkU!HUv0`0I1%y;YsMQM$J#ioZUd8LUR3eIjr zY0ixm_n!2R46$C<;W&l|>`~eF5)%nLHSlsngRx$xIV_m<7%7Z~9G24pcRe6b3`=;qV(oW^#`_e?j{c4cyB4$*M6ir)7s;8LCV}2(hwkkf%>9al^rvVaAOk zHx?Rj%+SK-<8Nkii)z@Ny)kA1KUcTc%P|t=hJ?|3nD9E*nhTF2c~!e5hAKSVQ-Krj zA!-gqaE_l#=}LEW7-fc_ZqrWf*}Ty|`^$eU{GpC=S@<@Y&(2TUf6o z%jd|1?C+mT#R|OIe`n|YG)F1U=b|827ni;?efoTMaze`7uB#rkI$zulxNc7GPA>SO~ob23YQv?V634hM9VwiI7MwA~4fEZhljLkHpRF5yA6f4xQd)B^ot?Y=793UwY# zkrdV2J8K!-)NI)Oy=y!iLY45D!#0;Zu@l)}e7af}%wUa%8yi|p*vL{CwQru?=`!vC zj&OXC#_XIYgHyJv3uw)Om5#LAx^mvhW0RN@`v zq9HCFp@O9+H_aVosLb=<+kTzkYF*msp{JomHZpy##O)ig;hPKCKr)$PxxjAnUfw;= Po#^&ISGiGa7Z3md4X!jm diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/.metadata.json.gz.crc index 22e7bcdd16c6ec66d485446413e2be4753ae5ad2..6d825bf424557eb3ac139e6d12e0964c35d5993b 100644 GIT binary patch literal 24 fcmYc;N@ieSU}Dg}=E8V3gPH%fp|Gx-l2{Y~Nx}v2 literal 24 gcmYc;N@ieSU}Cr`**RxNS|%?6U?j?B zqiZQTn>5gW-=RcNBt>5V`qH8(s^M@Baya}Xzj-N6h&S-A(~!K7@b3O{mXqMf^F8!L zUcJFLFV2Eh0na#&NlXgw0fvGsB!N)$25*gp`D;$Xks1IkB1ACqVSIrLGUIF&@VCm` zzz_vl%IQMLXo}gZpJxT-G)qBpo$~%s=}X@r7&3-mze&vppDp@ls5ZHuMOM&o&0S;!(u~6nR2G_uR9c7=mY^zrnO>#x z;FA|vCp}jxN~5vBGp-UWeVTH%yEY>cct`HsaGWn;Rf~c^h=#=+^JNB$M+3@V2jfYc z;mTZ`MVt_`lxP9f5TO(=;4fIovY6mhsG^9*kg_A=(l7Fu285ACT_T!2^1=xu6rws) z8lYMi)yn7@Gm2BLJeB@Rq?bb8JY%*kT5y^!z*fZI%%Ch|QPC`w8qHQbi#92&bZJDN z3Aoe?R@?7pR7OOty%N=-HsEvi13!)6;6To%5VT3U;7cXFgh()WnrEZSS>tPj^Hx6N 
znDJ&r7iW1R9Y51hgxl`YFy^&_bR(ag-X>y!{-S(Y$>fB}t5ZEC|IOyNQ|B0tIWktF z5ujM6UF$*lqV*ocs9~iMm{!WNeDGz}=;i+cEviyEJ@ywIfdg)J$>FGRHT{ z_s~e+?Z2wH_xgvHPl(lCWfzqN)osk-V_HPH5$lxE1amUHw)Vu2;F?Gh^yTwSvz?a0 zXn1*v@+Bi-E1O+U=CPf2$ zF{XAKBGD!`7c9cma2}GFV|y*6s`EaB1Sr(6VPq^M*0z?uU?J$T<`JzR@>=g(5%>~! z>QqAv+BTaHYca%Vf2*or)Dc2k$>LC46Y7dCR*wngu(4VbllTfDgcX=}hoE^7CWc&H zZA~nt56q70SWb^i!Qju5!43Mp5&!=7>29>_pZ~7lx^@AIupkCmYwBme*W_GlZ3Aaz z(MD!Dy(MM?nB{20%kr_tW$htv9!#K=LYIwl*;t0S+;$J^AL zU@Sv3x!8%sHW52ncnu2cIawgQCjz?(cr*KMLtiKPZUo=8+^Zw6Tv~&8b(sQQDKvp^ zE$%vL*MeP}b)^R<>N-eQsyi@r1LsMZHF|IUWBd(KhrEUed7T>m)nF3pN zsp(j&Q*F~a6x8mCv_?;Q$GLZtY#r53AZrA$-CqY_m8*^XYLTl6t{rA=7PUcEC$YA$ zsPTqE#g;qPgQ~@e1}3^Gbq}9w;B*I{l1$eyso5bVk*7D>!k{LCb`Gp#U)t>uu&9G)hdDLmw20FHPJ=g1+%#yj1Dg)c+>Dt9WjX+}!a%4-9kEYzj=jtdl8A%KU~6p>JkR8!IJeLoTh}mLGSSJuvfOfZG!(f zC=5N=aSz^lcQtc-bbNZ;w=9k3!q%v=MbjW&g=AL82FVj88w)o=Qi^mi=S+Aj%l`pS KUl|0O9RL6XmUD{$ literal 1815 zcmV+y2k7`8iwFP!000000Nt5wQ`U8kl8Bv6`fA%wz|mJG>oH`ie_iY+H9 zemhy=NE!e4TiLQK$x1?)>6dnxPP|(0i?mujwqLvyC&U|g*J((eNqBdEG0jPE?D-yg zBx`T*-HWqeRlqflV-l0Xdw`)J3rQdpy}?UkVYbdmI8pOF3N#8BH-;`*~JSPO}sw*D3EEm%j83f+1sg{atE4_-xVph-#DjS!4wb2hDJi zF-}6n8BU9U(VRnqX+i#7ku(6W%4zEWCp66v4ro{?D*;Zklm<8+)J}YuB`~AL7bGP} z^SWF-6@hV;t1B%O?T!S4g#uZW4_E_@@PQ$dN96> zGhCUAvxpO7mJ%(X8X}b91^j`PEQ<+Fg(`|@3@JM@F8v~pX+RiR)FqbLLsU% zr2(pSQLT)gFrzr-%2VmDM0zRY%`;}(q6Mev0&GPL_6*8078T7@snKl3vuK;bN|#3T ziGWMJV72vbMrB0QS}RcFrWX&|j1Nb z{AHG?oi-X?U$lK`NrFcx*htuuQ!P6lRPibnmAXa|!Fr-m_dJp|O>406`Y@29whs z3o9u$4`(FK4#i-S~cdV+$u)yv~;-Jw{je!#jKQemtG^ZQ<9+s3ThO3HW!2 ztL8kzg*c*ep`sNI^uUCe<)$?Q=h|s2c15_dM4oYAt-M@L&4_MrgofddwjfYzth&`%&*Dx{`5^GsYU$78#S#yh45P7Zltq6RH zGj*aN25pa1jtO-{7pr+fIV`N!#3a5#2w?@L-63cmgoz=44?Rnu)?r z3brBGiN71@SI@~}-#w|<4ZYiOcNgtCVRtL*ZbV(3a^=(-kgLlS<4U2)cpC}V3Ah&D z+ITBHIO)~_w^H3fo?Fn?B3o5hCfIJ}S_^BX37cx2P%CvSrnQi^J--yos&h;ySsi4X z)}c6dPl`2q(i`@@8(!6ptXlRh38%+1`xq(Mt;Zwrs1`IVjq=e87{J9H19rWpdPZxV`MNb!bcEQulojb78 zq|VNUb*xLf9D)yZ*6d)WMwu358jNWmrb(CvV0QS@ftK6J(f~^bQ+ANjrpcW!Y3%+E zx#%p{8;U)c6RF 
z4W+pdPU%ljI6XP(opx{gr{H0A>z?-KH6@fZOkgJ1VTy!3&l3c^(&8V(sQ z;KX)4EVP+uc^CyO;@|hlLPgby$%^JTsXy4MDUQEE*}ScuDu1kcS>mXms$d1s12UV7 z-L1n~@~dmE_QD9X+t3};o@J;ez$m?L;&$40()$;xE0qn2{EoE?llI292h|+3vtwE* zJz+eDXN*fKz9A02d*b5Y@Zhl5?|nRt4iEal>HMHShhJx(j*budhrN^V^yFy%>Fl`r zITPN>@_&hS)zpI> F003c(c;)~A diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/.part-0-034376f0-4c6b-4bf0-8912-4035d651b982.crc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/.part-0-034376f0-4c6b-4bf0-8912-4035d651b982.crc deleted file mode 100644 index 452d210694c39f4ccd05ec80f57d9f0c9464151a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16 XcmYc;N@ieSU}BgyKUUu-^pp<(BAo=Y diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/.part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.crc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/.part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.crc new file mode 100644 index 0000000000000000000000000000000000000000..b43af1575e5d13dd89e237f35f3f4fe513ea0cfb GIT binary patch literal 16 XcmYc;N@ieSU}8{b`&4CK+xip$ARYwr literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/part-0-034376f0-4c6b-4bf0-8912-4035d651b982 b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/part-0-034376f0-4c6b-4bf0-8912-4035d651b982 deleted file mode 100644 index de784fadf70f331163ac2c213fd9b628fe635c38..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 928 zcmV;R17G}$0{{T93IG5owJ-f(unFxO0H&yxOVHA)10c2yfW8r;q5J_#KF*vw8u246 zIV(Ca6wRJfKmf7Q*5Np`^w1J@K-mi4{LjQkHnx;oN-3qJCp+OrzNQ%uIl6L&py@4B zd;oL+jQ}Ryv$*Z;8LQmNS>{*FkzCZf+Kf|d(8@muWrcj|DT@_t-kM`uE3T%-D)G_L zRE>W+o9H92Xm93Yt~RzBZNROZ^<1r(Bel3~-FDZTtwwWYyhm>9X{*sncy7QL&kTs7 zG%*{9$^WM`5e7?dbVfhyB$Vv0LC*9vqycoqIcIzmJ`MlrXT5LL=1hc=C__D_A3b6Z 
zauh65y1`~Jdu_)ZW>*|rT=l)Ht#2PkZg=J5t{h*??RJfrvaTbqc4x~iC#k*OvEos6 zvLenq^Q^Dkc9$qOEIx~y=c_HQrlw+r*wJDUTYNLVR_|S}$tt@a-hr2d2R~iUug9R} z6(P3+>Ug^I<@@z=3v_5;o>Fe(C^XPwS>E&^52Qai>F1CN8ROf%TpmROsGF!VF8?yN z3oyzUGZ0D^9Un>%F*~G%R?jT@myIW0$N&HZ>HmLM;9!Luagp>dso;(-gu=r=SzslA zyg?A+QARjDg26z)|&s%qG`Yp%7SL(SX%L?6Su)pAah=LFBMmjB1tvY zsL(=oNZ|7#sh|&@Goi%)I3tVc9FSZe07q$Q+#sWc{(i$~1OcIN$%LJ-yQ;3%B~hKe zMO(O)VwR^)vC-Cy%Ph~T%^Wax=71<_qbBaeU0jJK%3!HU&!yGx=yrBLy`S#|?D1Al zUob8@9bkZosIe_FG$SKHlq5+C>F`3Kq3IF;;#dY_8IoZU2oW(tVw9xfhF@d0#x!c^M( zV*s(iL|gkKOl~ln3}!Y_2-8)D*TM>tO8(9hk@oYS@COY500000001bpFa00@0RRBl Cu)*>G diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3 b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3 new file mode 100644 index 0000000000000000000000000000000000000000..c750e490270b939409c7dad9c1e864d19a6d15b5 GIT binary patch literal 929 zcmV;S177@$0{{TB3IG5owJ-f(vI+GY0LJK+OVHA)10c2yfW8r;q5J_#KF*vw8u246 zIV(Ca6wRJfKmf7Q*5Np`^w1J@K-mi4{LjQkHnx;oN-3qJCp+OrzNQ%uIl6L&py@4B zd;oL+jQ}Ryv$*Z;8LQmNS>{*FkzCZf+Kf|d(8@muWrcj|DT@_t-kM`uE3T%-D)G_L zRE>W+o9H92Xm93Yt~RzBZNROZ^<1r(Bel3~-FDZTtwwWYyhm>9X{*sncy7QL&kTs7 zG%*{9$^WM`5e7?dbVfhyB$Vv0LC*9vqycoqIcIzmJ`MlrXT5LL=1hc=C__D_A3b6Z zauh65y1`~Jdu_)ZW>*|rT=l)Ht#2PkZg=J5t{h*??RJfrvaTbqc4x~iC#k*OvEos6 zvLenq^Q^Dkc9$qOEIx~y=c_HQrlw+r*wJDUTYNLVR_|S}$tt@a-hr2d2R~iUug9R} z6(P3+>Ug^I<@@z=3v_5;o>Fe(C^XPwS>E&^52Qai>F1CN8ROf%TpmROsGF!VF8?yN z3oyzUGZ0D^9Un>%F*~G%R?jT@myIW0$N&HZ>HmLM;9!Luagp>dso;(-gu=r=SzslA zyg?A+QARjDg26z)|&rMSu_neLRrv^97`+ybmBHR3S`a;@1^1@QY5Ly z8Wmc|4hei-Bo*|*b0(DdA7^ASodc5V1K=nvjT>aN(BE$ujUXTtE}5_sc30Kax+JR8 zw`dEuQq1zyDK^@gahc^=wV4CP&KwX$ZPdh_xQi>%L>VkK>ACc)-_h;tetJLO3)thW zp1xpQbUMI*iKxLXGBhJ2K}rge6w(3kLZPAQ5&+^@24fkLVG#%sF+yULsd!{$f^h@Y zqTFlJbzJceoyQvwG;*F$nJo6mw-gRWMLfrcMs*W^FU z@ihhz8%(sdf5OBD`(#|Ri9(pJGGG=~yHxUbo`|$d|Aapc000000000ewJ-f300961 D1I@IY literal 0 HcmV?d00001 From 
fcaba23d1e39b8b659e0b4a1ea074d6f7988df59 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Mon, 21 Apr 2025 20:32:52 -0400 Subject: [PATCH 24/55] test transcripts --- .../reference_datasets/reference_dataset.py | 2 +- .../lib/tasks/base/base_write_parquet.py | 1 + v03_pipeline/lib/tasks/exports/misc.py | 2 +- v03_pipeline/lib/tasks/exports/misc_test.py | 3 + .../exports/write_new_transcripts_parquet.py | 2 +- .../write_new_transcripts_parquet_test.py | 57 ++++++++++++++++++- .../exports/write_new_variants_parquet.py | 9 +-- v03_pipeline/lib/test/misc.py | 11 ++++ 8 files changed, 77 insertions(+), 10 deletions(-) create mode 100644 v03_pipeline/lib/test/misc.py diff --git a/v03_pipeline/lib/reference_datasets/reference_dataset.py b/v03_pipeline/lib/reference_datasets/reference_dataset.py index f56a8cc88..7dac429ae 100644 --- a/v03_pipeline/lib/reference_datasets/reference_dataset.py +++ b/v03_pipeline/lib/reference_datasets/reference_dataset.py @@ -68,7 +68,7 @@ def for_reference_genome_dataset_type_private( reference_genome, dataset_type, ) - if self.access_control == AccessControl.PRIVATE + if dataset.access_control == AccessControl.PRIVATE } @classmethod diff --git a/v03_pipeline/lib/tasks/base/base_write_parquet.py b/v03_pipeline/lib/tasks/base/base_write_parquet.py index 87c16e876..24f937ada 100644 --- a/v03_pipeline/lib/tasks/base/base_write_parquet.py +++ b/v03_pipeline/lib/tasks/base/base_write_parquet.py @@ -12,6 +12,7 @@ def run(self) -> None: ht = self.create_table() ht, _ = checkpoint(ht) df = ht.to_spark(flatten=False) + df = df.withColumnRenamed('key_', 'key') df.write.parquet( self.output().path, mode='overwrite', diff --git a/v03_pipeline/lib/tasks/exports/misc.py b/v03_pipeline/lib/tasks/exports/misc.py index 4cb4da9e8..c3d10d797 100644 --- a/v03_pipeline/lib/tasks/exports/misc.py +++ b/v03_pipeline/lib/tasks/exports/misc.py @@ -36,7 +36,7 @@ def array_structexpression_fields(ht: hl.Table): ht[field], 
hl.expr.expressions.typed_expressions.ArrayStructExpression, ) - ] + ], ) diff --git a/v03_pipeline/lib/tasks/exports/misc_test.py b/v03_pipeline/lib/tasks/exports/misc_test.py index 0b719fcad..db2be3c2d 100644 --- a/v03_pipeline/lib/tasks/exports/misc_test.py +++ b/v03_pipeline/lib/tasks/exports/misc_test.py @@ -47,6 +47,7 @@ def test_unmap_formatting_annotation_enums(self) -> None: self.assertEqual( ht.collect()[0], hl.Struct( + key_=0, locus=hl.Locus( contig='chr1', position=939121, @@ -192,6 +193,7 @@ def test_unmap_reference_dataset_annotation_enums(self) -> None: 'sorted_motif_feature_consequences', ).collect()[0], hl.Struct( + key_=0, locus=hl.Locus( contig='chr1', position=939121, @@ -294,6 +296,7 @@ def test_camelcase_array_structexpression_fields(self) -> None: self.assertEqual( ht.collect()[0], hl.Struct( + key_=0, locus=hl.Locus( contig='chr1', position=939121, diff --git a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py index 932c50baa..126085093 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py +++ b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py @@ -43,7 +43,7 @@ def requires(self) -> list[luigi.Task]: else self.clone(WriteNewVariantsTableTask), ] - def run(self) -> None: + def create_table(self) -> None: ht = hl.read_table( new_variants_table_path( self.reference_genome, diff --git a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py index 89b346d6b..7547dedf7 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py +++ b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py @@ -13,6 +13,7 @@ from v03_pipeline.lib.tasks.exports.write_new_transcripts_parquet import ( WriteNewTranscriptsParquetTask, ) +from v03_pipeline.lib.test.misc import convert_ndarray_to_list from 
v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase TEST_SNV_INDEL_ANNOTATIONS = ( @@ -78,6 +79,56 @@ def test_write_new_transcripts_parquet_test( worker.run() self.assertTrue(task.output().exists()) self.assertTrue(task.complete()) - - -pd.read_parquet('example_pa.parquet', engine='pyarrow') + df = pd.read_parquet( + os.path.join( + new_transcripts_parquet_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + TEST_RUN_ID, + ), + ), + ) + export_json = convert_ndarray_to_list(df.head(1).to_dict('records')) + self.assertListEqual( + list(export_json[0].keys()), + ['key', 'transcripts'] + ) + self.assertEqual( + export_json[0]['key'], + 0, + ) + self.assertEqual( + export_json[0]['transcripts'][0]['_0'], + 'ENSG00000187634', + ) + self.assertEqual( + export_json[0]['transcripts'][0]['_1'][0], + { + 'aminoAcids': 'S/L', + 'canonical': 1.0, + 'codons': 'tCg/tTg', + 'geneId': 'ENSG00000187634', + 'hgvsc': 'ENST00000616016.5:c.1049C>T', + 'hgvsp': 'ENSP00000478421.2:p.Ser350Leu', + 'transcriptId': 'ENST00000616016', + 'maneSelect': 'NM_001385641.1', + 'manePlusClinical': None, + 'exon': {'index': 6, 'total': 14}, + 'intron': None, + 'refseqTranscriptId': 'NM_001385641.1', + 'alphamissense': {'pathogenicity': None}, + 'loftee': {'isLofNagnag': None, 'lofFilters': None}, + 'spliceregion': { + 'extended_intronic_splice_region_variant': False, + }, + 'utrannotator': { + 'existingInframeOorfs': None, + 'existingOutofframeOorfs': None, + 'existingUorfs': None, + 'fiveutrAnnotation': None, + 'fiveutrConsequence': None, + }, + 'biotype': 'protein_coding', + 'consequenceTerms': ['missense_variant'], + }, + ) diff --git a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py index 5d3b8eba8..749745314 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py +++ b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py @@ -2,11 +2,11 @@ import luigi import 
luigi.util -from v03_pipeline.lib.model import FeatureFlag from v03_pipeline.lib.paths import ( new_variants_parquet_path, new_variants_table_path, ) +from v03_pipeline.lib.reference_datasets.reference_dataset import BaseReferenceDataset from v03_pipeline.lib.tasks.base.base_loading_run_params import ( BaseLoadingRunParams, ) @@ -46,7 +46,7 @@ def requires(self) -> list[luigi.Task]: else self.clone(WriteNewVariantsTableTask), ] - def run(self) -> None: + def create_table(self) -> None: ht = hl.read_table( new_variants_table_path( self.reference_genome, @@ -137,8 +137,9 @@ def run(self) -> None: **{f: ht[f] for f in array_structexpression_fields(ht)}, **{ rd: ht[rd] - for rd in BaseReferenceDataset.for_reference_genome_dataset_type( - self.reference_genome, self.dataset_type + for rd in BaseReferenceDataset.for_reference_genome_dataset_type_private( + self.reference_genome, + self.dataset_type, ) }, ) diff --git a/v03_pipeline/lib/test/misc.py b/v03_pipeline/lib/test/misc.py new file mode 100644 index 000000000..328123915 --- /dev/null +++ b/v03_pipeline/lib/test/misc.py @@ -0,0 +1,11 @@ +import numpy as np + + +def convert_ndarray_to_list(obj): + if isinstance(obj, np.ndarray): + return [convert_ndarray_to_list(item) for item in obj.tolist()] + if isinstance(obj, dict): + return {k: convert_ndarray_to_list(v) for k, v in obj.items()} + if isinstance(obj, list): + return [convert_ndarray_to_list(item) for item in obj] + return obj From dd0c02402848593f6d8bf8eed4837789f88415c4 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Mon, 21 Apr 2025 22:27:23 -0400 Subject: [PATCH 25/55] closer --- v03_pipeline/lib/tasks/exports/misc.py | 6 +- .../write_new_transcripts_parquet_test.py | 5 +- .../write_new_variants_parquet_test.py | 112 +++++++++++++++++- 3 files changed, 115 insertions(+), 8 deletions(-) diff --git a/v03_pipeline/lib/tasks/exports/misc.py b/v03_pipeline/lib/tasks/exports/misc.py index c3d10d797..051db90bf 100644 --- 
a/v03_pipeline/lib/tasks/exports/misc.py +++ b/v03_pipeline/lib/tasks/exports/misc.py @@ -60,13 +60,13 @@ def subset_filterable_transcripts_fields( field_name = transcripts_field_name(reference_genome, dataset_type) return ht.annotate( **{ - field_name: ht[field_name].map( - lambda c: c.select( + field_name: hl.enumerate(ht[field_name]).starmap( + lambda idx, c: c.select( **{ new_nested_field_name: parse_nested_field( ht[field_name], existing_nested_field_name, - ) + )[idx] for new_nested_field_name, existing_nested_field_name in dataset_type.export_parquet_filterable_transcripts_fields( reference_genome, ).items() diff --git a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py index 7547dedf7..8c0e651dc 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py +++ b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py @@ -89,10 +89,7 @@ def test_write_new_transcripts_parquet_test( ), ) export_json = convert_ndarray_to_list(df.head(1).to_dict('records')) - self.assertListEqual( - list(export_json[0].keys()), - ['key', 'transcripts'] - ) + self.assertListEqual(list(export_json[0].keys()), ['key', 'transcripts']) self.assertEqual( export_json[0]['key'], 0, diff --git a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py index 2503851af..90c294b5c 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py +++ b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py @@ -1,15 +1,19 @@ +import os + import hail as hl import luigi.worker +import pandas as pd from v03_pipeline.lib.model import ( DatasetType, ReferenceGenome, SampleType, ) -from v03_pipeline.lib.paths import new_variants_table_path +from v03_pipeline.lib.paths import new_variants_parquet_path, new_variants_table_path from 
v03_pipeline.lib.tasks.exports.write_new_variants_parquet import ( WriteNewVariantsParquetTask, ) +from v03_pipeline.lib.test.misc import convert_ndarray_to_list from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase TEST_SNV_INDEL_ANNOTATIONS = ( @@ -53,3 +57,109 @@ def test_write_new_variants_parquet_test( worker.run() self.assertTrue(task.output().exists()) self.assertTrue(task.complete()) + df = pd.read_parquet( + os.path.join( + new_variants_parquet_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + TEST_RUN_ID, + ), + ), + ) + export_json = convert_ndarray_to_list(df.head(1).to_dict('records')) + export_json[0]['sortedTranscriptConsequences'] = [ + export_json[0]['sortedTranscriptConsequences'][0], + ] + self.assertEqual( + export_json, + [ + { + 'key': 0, + 'xpos': 1000939121, + 'chrom': 'chr1', + 'pos': 939121, + 'ref': 'C', + 'alt': 'T', + 'variantId': '1-939121-C-T', + 'rsid': None, + 'CAID': 'CA502654', + 'liftedOverChrom': '1', + 'liftedOverPos': 874501, + 'screenRegionType': None, + 'predictions': { + 'cadd': 23.5, + 'eigen': 2.628000020980835, + 'fathmm': 0.7174800038337708, + 'gnomad_noncoding': None, + 'mpc': 0.01291007362306118, + 'mut_pred': None, + 'mut_tester': 'D', + 'polyphen': 0.164000004529953, + 'primate_ai': 0.5918066501617432, + 'revel': 0.3109999895095825, + 'sift': 0.0010000000474974513, + 'splice_ai': 0.0, + 'splice_ai_consequence': 'No consequence', + 'vest': 0.39500001072883606, + }, + 'populations': { + 'exac': { + 'ac': 20, + 'af': 0.00019039999460801482, + 'an': 47974, + 'filter_af': 0.0007150234305299819, + 'hemi': None, + 'het': 20, + 'hom': 0, + }, + 'gnomad_exomes': { + 'ac': 964, + 'af': 0.0006690866430290043, + 'an': 1440770, + 'filter_af': 0.0008023773552849889, + 'hemi': 0, + 'hom': 0, + }, + 'gnomad_genomes': { + 'ac': 42, + 'af': 0.0002759889466688037, + 'an': 152180, + 'filter_af': 0.0005293028079904616, + 'hemi': 0, + 'hom': 0, + }, + 'topmed': { + 'ac': 41, + 'af': 
0.00032651599030941725, + 'an': 125568, + 'het': 41, + 'hom': 0, + }, + }, + 'sortedMotifFeatureConsequences': [ + { + 'consequenceTerms': ['TF_binding_site_variant'], + 'motifFeatureId': 'ENSM00493959715', + }, + ], + 'sortedRegulatoryFeatureConsequences': [ + { + 'biotype': 'CTCF_binding_site', + 'consequenceTerms': ['regulatory_region_variant'], + 'regulatoryFeatureId': 'ENSR00000344437', + }, + ], + 'sortedTranscriptConsequences': [ + { + 'alphamissense': None, + 'canonical': 1.0, + 'consequenceTerms': ['missense_variant'], + 'extended_intronic_splice_region_variant': False, + 'fiveutrConsequence': None, + 'geneId': 'ENSG00000187634', + }, + ], + 'hgmd': None, + }, + ], + ) From 23ee29a70b3c3a47d5c9bceb8fb47c87b12b7956 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Tue, 22 Apr 2025 11:42:15 -0400 Subject: [PATCH 26/55] missed some tests --- ...nnotations_table_with_updated_reference_dataset_test.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py index 8c161b65c..29f0a11e4 100644 --- a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py +++ b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py @@ -91,6 +91,7 @@ def test_create_empty_annotations_table(self): **BASE_ENUMS, ), migrations=[], + max_key_=-1, updates=set(), ), ], @@ -125,6 +126,7 @@ def test_update_vat_snv_indel_38( enums=hl.Struct(), updates=hl.empty_set(hl.tstruct(callset=hl.tstr, project_guid=hl.tstr)), migrations=hl.empty_array(hl.tstr), + max_key_=0, ), ) @@ -173,6 +175,7 @@ def test_update_vat_snv_indel_38( ), migrations=[], updates=set(), + max_key_=0, ), ], ) @@ -276,6 +279,7 @@ def test_update_vat_mito_38( enums=hl.Struct(), 
updates=hl.empty_set(hl.tstruct(callset=hl.tstr, project_guid=hl.tstr)), migrations=hl.empty_array(hl.tstr), + max_key_=0, ), ) @@ -325,6 +329,7 @@ def test_update_vat_mito_38( ), migrations=[], updates=set(), + max_key_=0, ), ], ) @@ -404,6 +409,7 @@ def test_update_vat_snv_indel_37( enums=hl.Struct(), updates=hl.empty_set(hl.tstruct(callset=hl.tstr, project_guid=hl.tstr)), migrations=hl.empty_array(hl.tstr), + max_key_=0, ), ) @@ -452,6 +458,7 @@ def test_update_vat_snv_indel_37( ), migrations=[], updates=set(), + max_key_=0, ), ], ) From 81e9ba410953b8e81a1635a0151ff407b9508acf Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Tue, 22 Apr 2025 12:20:24 -0400 Subject: [PATCH 27/55] add a parquet reader requirement --- requirements-dev.in | 1 + requirements-dev.txt | 2 ++ 2 files changed, 3 insertions(+) diff --git a/requirements-dev.in b/requirements-dev.in index 43b9efcbd..cdf3a4ca3 100644 --- a/requirements-dev.in +++ b/requirements-dev.in @@ -7,3 +7,4 @@ responses>=0.23.1 ruff>=0.1.8 shellcheck-py>=0.10.0 pysam +pyarrow diff --git a/requirements-dev.txt b/requirements-dev.txt index cf969452f..8f2e93a4f 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -59,6 +59,8 @@ packaging==24.2 # sphinx pip-tools==7.4.1 # via -r requirements-dev.in +pyarrow==19.0.1 + # via -r requirements-dev.in pygments==2.19.1 # via # -c requirements.txt From bd671200deaade48d7e78a8946be28f7979644b0 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Tue, 22 Apr 2025 14:43:14 -0400 Subject: [PATCH 28/55] use project tables --- .../lib/tasks/exports/write_new_entries_parquet.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py index f1aac53b1..966a0b61b 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py +++ b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py @@ -16,10 +16,8 @@ 
from v03_pipeline.lib.tasks.update_new_variants_with_caids import ( UpdateNewVariantsWithCAIDsTask, ) +from v03_pipeline.lib.tasks.update_project_table import UpdateProjectTableTask from v03_pipeline.lib.tasks.write_new_variants_table import WriteNewVariantsTableTask -from v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset import ( - WriteRemappedAndSubsettedCallsetTask, -) @luigi.util.inherits(BaseLoadingRunParams) @@ -44,9 +42,9 @@ def requires(self) -> list[luigi.Task]: UpdatedReferenceDatasetQueryTask, reference_dataset_query=ReferenceDatasetQuery.high_af_variants, ), - 'remapped_and_subsetted_callsets': [ + 'project_tables': [ self.clone( - WriteRemappedAndSubsettedCallsetTask, + UpdateProjectTableTask, project_i=i, ) for i in range(len(self.project_guids)) From 7768409e7d0934340729892b79a95c0a3b36ad7c Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 24 Apr 2025 08:49:54 -0400 Subject: [PATCH 29/55] finish off first pass at test! --- v03_pipeline/lib/misc/family_entries.py | 26 +++- .../exports/write_new_entries_parquet.py | 119 +++++++++++++++--- .../exports/write_new_entries_parquet_test.py | 77 ++++++++++++ .../lib/tasks/update_project_table.py | 21 +--- ...ate_project_table_with_deleted_families.py | 25 ++-- .../SNV_INDEL/annotations.ht/.README.txt.crc | Bin 12 -> 12 bytes .../SNV_INDEL/annotations.ht/README.txt | 2 +- .../globals/.metadata.json.gz.crc | Bin 16 -> 16 bytes .../annotations.ht/globals/metadata.json.gz | Bin 762 -> 761 bytes .../annotations.ht/globals/parts/.part-0.crc | Bin 104 -> 24 bytes .../annotations.ht/globals/parts/part-0 | Bin 11896 -> 1571 bytes .../.index.crc | Bin .../.metadata.json.gz.crc | Bin .../index | Bin .../metadata.json.gz | Bin .../annotations.ht/rows/.metadata.json.gz.crc | Bin 24 -> 24 bytes .../annotations.ht/rows/metadata.json.gz | Bin 1820 -> 1820 bytes ...-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1.crc} | Bin ...rt-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1} | Bin 19 files changed, 216 
insertions(+), 54 deletions(-) create mode 100644 v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py rename v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/{part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.idx => part-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1.idx}/.index.crc (100%) rename v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/{part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.idx => part-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1.idx}/.metadata.json.gz.crc (100%) rename v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/{part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.idx => part-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1.idx}/index (100%) rename v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/{part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.idx => part-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1.idx}/metadata.json.gz (100%) rename v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/{.part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.crc => .part-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1.crc} (100%) rename v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/{part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3 => part-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1} (100%) diff --git a/v03_pipeline/lib/misc/family_entries.py b/v03_pipeline/lib/misc/family_entries.py index d4ed9811c..9cbd7831f 100644 --- a/v03_pipeline/lib/misc/family_entries.py +++ b/v03_pipeline/lib/misc/family_entries.py @@ -1,6 +1,30 @@ import hail as hl -from v03_pipeline.lib.model import DatasetType +from v03_pipeline.lib.model import DatasetType, ReferenceGenome + + +def initialize_project_table( + reference_genome: ReferenceGenome, + dataset_type: DatasetType, +): + key_type = dataset_type.table_key_type(reference_genome) + return hl.Table.parallelize( + [], + hl.tstruct( + **key_type, + filters=hl.tset(hl.tstr), + # NB: entries is missing here because it is untyped + # until we read the type off of the first 
callset aggregation. + ), + key=key_type.fields, + globals=hl.Struct( + family_guids=hl.empty_array(hl.tstr), + family_samples=hl.empty_dict(hl.tstr, hl.tarray(hl.tstr)), + updates=hl.empty_set( + hl.tstruct(callset=hl.tstr, remap_pedigree_hash=hl.tint32), + ), + ), + ) def compute_callset_family_entries_ht( diff --git a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py index 966a0b61b..ca33c1dfd 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py +++ b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py @@ -1,10 +1,19 @@ +import hail as hl import luigi import luigi.util +from v03_pipeline.lib.annotations.expression_helpers import get_expr_for_xpos +from v03_pipeline.lib.annotations.fields import get_fields +from v03_pipeline.lib.misc.family_entries import ( + compute_callset_family_entries_ht, +) from v03_pipeline.lib.paths import ( new_entries_parquet_path, ) -from v03_pipeline.lib.reference_datasets.reference_dataset import ReferenceDatasetQuery +from v03_pipeline.lib.reference_datasets.reference_dataset import ( + BaseReferenceDataset, + ReferenceDatasetQuery, +) from v03_pipeline.lib.tasks.base.base_loading_run_params import ( BaseLoadingRunParams, ) @@ -13,11 +22,12 @@ from v03_pipeline.lib.tasks.reference_data.updated_reference_dataset_query import ( UpdatedReferenceDatasetQueryTask, ) -from v03_pipeline.lib.tasks.update_new_variants_with_caids import ( - UpdateNewVariantsWithCAIDsTask, +from v03_pipeline.lib.tasks.update_variant_annotations_table_with_new_samples import ( + UpdateVariantAnnotationsTableWithNewSamplesTask, +) +from v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset import ( + WriteRemappedAndSubsettedCallsetTask, ) -from v03_pipeline.lib.tasks.update_project_table import UpdateProjectTableTask -from v03_pipeline.lib.tasks.write_new_variants_table import WriteNewVariantsTableTask @luigi.util.inherits(BaseLoadingRunParams) @@ -33,23 
+43,98 @@ def output(self) -> luigi.Target: def requires(self) -> list[luigi.Task]: return { - 'annotations': ( - self.clone(UpdateNewVariantsWithCAIDsTask) - if self.dataset_type.should_send_to_allele_registry - else self.clone(WriteNewVariantsTableTask) + 'annotations_table_task': ( + self.clone(UpdateVariantAnnotationsTableWithNewSamplesTask) ), - 'high_af_variants': self.clone( - UpdatedReferenceDatasetQueryTask, - reference_dataset_query=ReferenceDatasetQuery.high_af_variants, - ), - 'project_tables': [ + 'remapped_and_subsetted_callset_tasks': [ self.clone( - UpdateProjectTableTask, + WriteRemappedAndSubsettedCallsetTask, project_i=i, ) for i in range(len(self.project_guids)) ], + **( + { + 'high_af_variants_table_task': self.clone( + UpdatedReferenceDatasetQueryTask, + reference_dataset_query=ReferenceDatasetQuery.high_af_variants, + ), + } + if ReferenceDatasetQuery.high_af_variants + in BaseReferenceDataset.for_reference_genome_dataset_type( + self.reference_genome, + self.dataset_type, + ) + else {} + ), } - def run(self) -> None: - pass + def create_table(self) -> None: + unioned_ht = None + for project_guid, remapped_and_subsetted_callset_task in zip( + self.project_guids, + self.input()['remapped_and_subsetted_callset_tasks'], + strict=True, + ): + mt = hl.read_matrix_table(remapped_and_subsetted_callset_task.path) + ht = compute_callset_family_entries_ht( + self.dataset_type, + mt, + get_fields( + mt, + self.dataset_type.genotype_entry_annotation_fns, + **self.param_kwargs, + ), + ) + ht = ht.annotate( + family_entries=hl.enumerate(ht.family_entries).starmap( + lambda i, fs: hl.enumerate(fs).starmap( + lambda _, e: e.annotate( + family_guid=ht.family_guids[i], # noqa: B023 + ), + ), + ), + ) + annotations_ht = hl.read_table( + self.input()['annotations_table_task'].path, + ) + ht = ht.join(annotations_ht) + if self.input().get('high_af_variants_table_task'): + gnomad_high_af_ht = hl.read_table( + self.input()['high_af_variants_table_task'].path, + ) + 
ht = ht.join(gnomad_high_af_ht) + + # the family entries ht will contain rows + # where at least one family is defined... after explosion, + # those rows should be removed. + ht = ht.explode(ht.family_entries) + ht = ht.filter(hl.is_defined(ht.family_entries)) + + ht = ht.key_by() + ht = ht.select( + project_guid=project_guid, + family_guid=ht.family_entries.family_guid[0], + is_gnomad_gt_5_percent=ht.is_gt_5_percent, + key=ht.key_, + sample_ids=ht.family_samples[ht.family_entries.family_guid[0]], + xpos=get_expr_for_xpos(ht.locus), + filters=ht.filters, + sample_type=self.sample_type.value, + GQ=ht.family_entries.GQ, + AB=ht.family_entries.AB, + DP=ht.family_entries.DP, + GT=ht.family_entries.GT.map( + lambda x: hl.case() + .when(x.is_hom_ref(), 0) + .when(x.is_het(), 1) + .when(x.is_hom_var(), 2) + .default(hl.missing(hl.tint32)), + ), + ) + if not unioned_ht: + unioned_ht = ht + else: + unioned_ht = unioned_ht.union(ht) + return unioned_ht + diff --git a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py new file mode 100644 index 000000000..372a5cde6 --- /dev/null +++ b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py @@ -0,0 +1,77 @@ +import os + +import hail as hl +import luigi.worker +import pandas as pd + +from v03_pipeline.lib.model import ( + DatasetType, + ReferenceGenome, + SampleType, +) +from v03_pipeline.lib.paths import ( + new_entries_parquet_path, + variant_annotations_table_path, +) +from v03_pipeline.lib.tasks.exports.write_new_entries_parquet import ( + WriteNewEntriesParquetTask, +) +from v03_pipeline.lib.test.misc import convert_ndarray_to_list +from v03_pipeline.lib.test.mocked_reference_datasets_testcase import ( + MockedReferenceDatasetsTestCase, +) + +TEST_RUN_ID = 'manual__2024-04-03' + +TEST_PEDIGREE_3_REMAP = 'v03_pipeline/var/test/pedigrees/test_pedigree_3_remap.tsv' +TEST_PEDIGREE_4_REMAP = 
'v03_pipeline/var/test/pedigrees/test_pedigree_4_remap.tsv' +TEST_SNV_INDEL_ANNOTATIONS = ( + 'v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht' +) +TEST_SNV_INDEL_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' + + +class WriteNewEntriesParquetTest(MockedReferenceDatasetsTestCase): + def setUp(self) -> None: + super().setUp() + ht = hl.read_table( + TEST_SNV_INDEL_ANNOTATIONS, + ) + ht.write( + variant_annotations_table_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + ), + ) + + def test_write_new_entries_parquet(self): + worker = luigi.worker.Worker() + task = WriteNewEntriesParquetTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path=TEST_SNV_INDEL_VCF, + project_guids=['R0113_test_project', 'R0114_project4'], + project_pedigree_paths=[TEST_PEDIGREE_3_REMAP, TEST_PEDIGREE_4_REMAP], + skip_validation=True, + run_id=TEST_RUN_ID, + ) + worker.add(task) + worker.run() + self.assertTrue(task.output().exists()) + self.assertTrue(task.complete()) + df = pd.read_parquet( + os.path.join( + new_entries_parquet_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + TEST_RUN_ID, + ), + ), + ) + export_json = convert_ndarray_to_list(df.head(1).to_dict('records')) + self.assertListEqual(list(export_json[0].keys()), ['key', 'transcripts']) + self.assertEqual( + export_json[0]['key'], + 0, + ) diff --git a/v03_pipeline/lib/tasks/update_project_table.py b/v03_pipeline/lib/tasks/update_project_table.py index 6cec6e9ba..9540793d6 100644 --- a/v03_pipeline/lib/tasks/update_project_table.py +++ b/v03_pipeline/lib/tasks/update_project_table.py @@ -5,6 +5,7 @@ from v03_pipeline.lib.annotations.fields import get_fields from v03_pipeline.lib.misc.family_entries import ( compute_callset_family_entries_ht, + initialize_project_table, join_family_entries_hts, remove_family_guids, ) @@ -50,23 +51,9 @@ def requires(self) -> luigi.Task: return 
self.clone(WriteRemappedAndSubsettedCallsetTask) def initialize_table(self) -> hl.Table: - key_type = self.dataset_type.table_key_type(self.reference_genome) - return hl.Table.parallelize( - [], - hl.tstruct( - **key_type, - filters=hl.tset(hl.tstr), - # NB: entries is missing here because it is untyped - # until we read the type off of the first callset aggregation. - ), - key=key_type.fields, - globals=hl.Struct( - family_guids=hl.empty_array(hl.tstr), - family_samples=hl.empty_dict(hl.tstr, hl.tarray(hl.tstr)), - updates=hl.empty_set( - hl.tstruct(callset=hl.tstr, remap_pedigree_hash=hl.tint32), - ), - ), + return initialize_project_table( + self.reference_genome, + self.dataset_type, ) def update_table(self, ht: hl.Table) -> hl.Table: diff --git a/v03_pipeline/lib/tasks/update_project_table_with_deleted_families.py b/v03_pipeline/lib/tasks/update_project_table_with_deleted_families.py index 56277f34b..e2f5373ab 100644 --- a/v03_pipeline/lib/tasks/update_project_table_with_deleted_families.py +++ b/v03_pipeline/lib/tasks/update_project_table_with_deleted_families.py @@ -1,7 +1,10 @@ import hail as hl import luigi -from v03_pipeline.lib.misc.family_entries import remove_family_guids +from v03_pipeline.lib.misc.family_entries import ( + initialize_project_table, + remove_family_guids, +) from v03_pipeline.lib.model import SampleType from v03_pipeline.lib.paths import project_table_path from v03_pipeline.lib.tasks.base.base_update import BaseUpdateTask @@ -40,23 +43,9 @@ def complete(self) -> bool: ) def initialize_table(self) -> hl.Table: - key_type = self.dataset_type.table_key_type(self.reference_genome) - return hl.Table.parallelize( - [], - hl.tstruct( - **key_type, - filters=hl.tset(hl.tstr), - # NB: entries is missing here because it is untyped - # until we read the type off of the first callset aggregation. 
- ), - key=key_type.fields, - globals=hl.Struct( - family_guids=hl.empty_array(hl.tstr), - family_samples=hl.empty_dict(hl.tstr, hl.tarray(hl.tstr)), - updates=hl.empty_set( - hl.tstruct(callset=hl.tstr, remap_pedigree_hash=hl.tint32), - ), - ), + return initialize_project_table( + self.reference_genome, + self.dataset_type, ) def update_table(self, ht: hl.Table) -> hl.Table: diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/.README.txt.crc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/.README.txt.crc index bf6256713d879829918ffe9996699c1bdcd22c58..73363121b99513464789a5b6ef363643c72a5c0c 100644 GIT binary patch literal 12 TcmYc;N@ieSU}9MQe8*A%6y^jb literal 12 TcmYc;N@ieSU}C7aInNFN6J-Nv diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/README.txt b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/README.txt index 76c4a0d1b..81b15aae7 100644 --- a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/README.txt +++ b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. 
Written with version 0.2.133-4c60fddb171a - Created at 2025/04/21 16:05:23 \ No newline at end of file + Created at 2025/04/24 00:34:36 \ No newline at end of file diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/globals/.metadata.json.gz.crc index a37c1cabbbabc15da64c3963c76039eb01c76393..88dfcec7c74f199e88d125b7723fae11461175d3 100644 GIT binary patch literal 16 XcmYc;N@ieSU}Bi7(zoEeZ*&|0Bl`sZ literal 16 XcmYc;N@ieSU}Dg#H+bZACv*n@A^im{ diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/globals/metadata.json.gz b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/globals/metadata.json.gz index ea9fe0803d8aae31bb1cfdac1635277ae32e0413..a02ab1784cf10f1114eeeacf11471a0f16dc1cee 100644 GIT binary patch delta 332 zcmV-S0ki)41^ETAw*h}m&aVnDFWURGB6Pou{*YtI6?<~3nH|HOEN(hk913DA5oFN@ z_1oWB{X%zUyp1t9y@g$j-@XpmZ&df(ZBl3LHz-csuSn0`ug1aXMQ>EgQ0NYgI*`1` zJtJ=eLePm=)S|e7pbc`2+INPGO$gX&=^C8ZH#rilSgInCn!bN-(EU)FULi`S6hPeM{o^eWi{vmfSH)Ek0Tp<#fXXGWO1=6wvyc-Bgw;7R~bl8-bOh`#w%TY(X zimaME2Q&Bv`?i?BzxXist~B9mS&ATJUTXeGNu!{Z1DEm-1zcZUU0#oAzbuM9^_4uT z09Z9-w(dto)XYTS7g>U(@5;m(Z{H>E`|{z2sRHJ@QNp=)iA8c!S|WiGhX<*6*$C3| ey^96_M{MC@9MZaM4AEG}Y5xM}JFHUP3IG6-3!<3- delta 333 zcmV-T0kZ!21^NZBw*h}WTYOb`dC}gd6@mL*^k*C^uGo_k&5S&UaHoo!P8ElG7%K!> zv_SpxcUHg8of$7<3{Ed$7vq<&1NIBmJ$H-LS^EWwQ}-LvbN8EZFnY}!)iTt%L!%BP zuW`?)+kg;sBG$AhZlGs_9HaJ~p<)yIby~Rw=k-mF^eR@WNTYwIuN!nfRHj#G(kV3> zoOh-Y4U{47|Cwi;(w=|F-K))5X9ib@H0Bv~iD`kdECKHZdeUu1q$V9Ur6m(m64!Fn z5w9YvCeOhPKEb{%=I<{)%)Kj3_*#}C2$`3fKT^^tXyw4A{6hiPS67$UW7;o^Vo!Y~ zk17CG4VkU`Q4vKoGx$Z8VCl0mamL$siTk{KxM8Y*xo(tju3ci0oRpSGpv2)pDqc2% fqk|voJ!Mu9u1y2sR%Fw~qt=iwRT1&v2lxR#Mf?Me-2mbM&;SpL zkkM=$d?46x!Be4x>BBCLR`k!%eD$dk5eNmkV01VglTHQ4Q{v&2 zph)0IxNuB56%zs6EmL<5D1@p2nH|vMBjfQwYQbHr!>4+_(*PeVa|-E&#Rtn4((W)m zSk~Re2g~XQ%jQ-U0Twb81`8dmt(eM98F5?~GE|1bK-!ei#lpqT+=4b`lyMCpprAFs zzPRFe7O?Pdj_d1ws^B`y7wPzzT)+D%F{a8H{0Unn 
z=Xb66a^&)SLSJurXVK{DFN5=z^2N!l|A39z<}Y3SitAMOI4)K8gc?)I@TUyfSliPD zN+5dDW*)9L#(!4t)tG08{7To~F|7w#TaD{$EdVG~^r@D4a;>pkDS^Y`9EdOVky|>s z?pD*@b~qeP0RR*eB|?x8adLw3@Bze#3R%N?Ivq}y*&xe=ay-kibT+gyjq$-vrCc}j zu{P`-S+-mcht;82QC3pb^P*LeW5_YzeiX6ii2G1#}8!8z@r<1o~)F zuo23%lAT7^W+kk2Emp!pLZlFh^jQ%0xsXU7mD7eObYUxtl`<4ITC9{I^l`dC8$yLh zCY2Hh8(~1$N`~4C3uSD0Os?J;Z+T7AJ)V`yl?Z`L1slO|;AP+e$xthc0rbu z>018Yq8X79?O_Li{;1(Vr3^wcGa`}{BuNKA0zw=sS~n6P6E#j_OfrN<$h0s8mF(;W zcL4C|(}7|ma3CKv*Q;ok+WB|r`QtlkL43A5z$!_{7K{!)k}jl_JfUjMk|H9l7O6>CbI4X$G=#FngDlV3qP@Z_oB5H<)}^OZ;724!0$4{J;*qD8@n50 zpDyrx+@Y9gHK<{CU}Ass%w4GER(EK91&! zed5q-B#@pSKcg%@E9-3^v^U1HP#$)CDP7(uY+G68NW3CNmE%%hXeKhDjgv^yAlV`y zWcxdbqN>X%yMTOj7(RUA!-_mGPCxMkLlGvdFP>N}yZEe literal 11896 zcmV-;E{D<2DgXcg0096fwJ-f(0RIhE0U8nNXHziT)uat0sp~Ib$v;KW1OoUiSvGON z7HC0AZ2%QSb@yYd{~LkSCbF@Wv_>iN%79{_+SwU9&v=jdPZee&ySux4d+8zo(1XV? z3}czbaX=ik{gVc526zVW*-qul*=#oveXdBIL!L&%gl&rgZD*;IRBo~j z=o7*;QYQ_uL8_fjB0`ftuTCcssdPFiNcErGrx6GW;bvt~8?&w<6u^?ou#M13LKLyW|N5+= zrj=>eVE+8dAguElX&_tTap&h{r^T_TKq$m;*Wf8`5H zo8{~Kf1YQ5;osL&BK%%-k_r=_m1)7ZuJ}NhmaxM+PKP{3SFP!&%P3ctlcRA?0jYTJH@FGb=`<=DoaEF zA%tX&LD?kY)2ZvTDi^`1P!Qdsvag;8(J9<1AK8^NEJub}6^?A8K-h=!HU0SL=oSx> z@o%aUVMH7xN){yG5sF-v4!*O5}pevbKPnc^S9kM+m8)rqvc%-8^O z_1Ho9_`wn*?Oyf35+mh}@$jJ=CM-=|Kd{6|R>yi^iIMuoZm`5i7Y{5k65bdU`hg`z zqCz)VVx$abFg~!vNEgmvd{_cyNssUUvH0jQ5%wtA;Udl8kY=PAC`_`{WlrHBy#++0?ctI_ALBRq7A`uv4EmF19ifi~_hvV`<#nQ8HHeP)HMOZ3C_ zyJt}UA8uF`*7KENqe(|bVffQ2-YJcf%n+$Wp&ymbSO!UwnH@JIj*}!coFs9v;l!mz z17KWiz_`@t;dlR$5|4Fj2f&KqfniqFEteb^W>uZap8#A|Ci=ZucHHOzNs#`b`s*Ft zqO$8TEMqH%$dJzO-aizUfKKJ?`rV%hxyh)>`gAJWR5f+0B>_K(bPC{uh|X7ERT=XK zVN|7ahP`y82;WpsLow+Lv8nza3zTIiIecLJvi4t?|L2pdsgH2Tx+aVvAnTdBat;3K z(WGrIVWn~bWVG79UDVeI;;TN?=PBWuE$)v=InQCX(8g<)9- zmhGP=79W6c!inLqa9lV%G#jhJaZ`0GhYXV!5f3sJ^;bc5fuoEMkEWsWpFrB9%jJxK4 zSvPs7+~&V*)xy^%_v}+RKO9(9qv@uiLV3b&S||AkS(p{Fs*JO|yR|zy(>S4o{PSxr zUfbYgWuu?JNn3yGB{$V*Np+A5jt&0(*SwU*_pn3)yC9Ky`<}IP?!URkXmx{=mTj8R 
zHti^!sFpb3)VM+vEYJlG7%7Ou6#1q<4TfKnZE($Vt4`~rBQj7t?Ztzn!LP|tT5CnT zc#zT1nd!scq9k*>se%=_uj9nx)*>f(K~80-kk@eQfZ% z7pHb-a`SnN(-iA&vew+<+p@jwVuQ2Bhg5RP7()$JTy7ti)>+D7ZBkR+=4;uuqf@Jo z&bMEQRaqFGChTf;k`MW&i_k$JiKpY_i#Ol*q@|5!o?CrVHV7xz^mEPa=44r83fuTQ zRlSH3;TjCnWch>Ztxi(G-0%2L`QPa_8}Du0YT5M6v2iD5ob(IF2RKLp0xeuQsgG5y zby5&ad|FlImaJ3OlFq;0liT@XeJ$o^8+2N>IJs}refcsUk@ABXDnLOLl=w@8v5%`Q z53o9R?&6R2xR0*CFjn)yAL~2%-E$T?>h9EPeB1bRvgND4y_7ZoZIaT%3R18G4)C5a z_7P>=Da=Jyw?a5p>w<(#2m6Ye@U&~@rCG;)?>AcMZSO+aX1=z?uNqEPNy-mhP$3Jm zKHT#X{9L3VK^=&Iufy4Hf@h7PJ{Io($rs18Rwo6C5MN5XjFoAbubY)mOJUo!-d1Cl z?QMRW<#0PVBL{n%o-Yb>?1S&EPVzDBnw9UA4q3ZX>(g zKUog;A!~`3RXS;i7aVJer@EE(Q7B%ZjU9Bb8ZEhklLTJkalfLQ43tqe81pNGR*G0D zqKFH$JJ@#%XIB<=9&~WO3x3gG<>Yp%FYe~}-*zYZW{bC8>on|wlVP7#TbPM`_v$11=PNdXS|rcP%5xYAlb4Q`@!Acuo|`vrYi@97+5SG-E_^!It2s85aDg$B zkh9G7g>~$%#nWFa-`^w2#9%yIm1xx2(@qz{>awQI?rPh+EE}AlY_n3yCkrQ#0;$8; z+c$Dw^ z!sqaWDXw%F``B=9bnycd`FU^BL!rT!ejm!bv>w4xM$F) zQ%Srkp@Z;Y8BdhXwp%`{?}IzHJ+*H9TG{xFvpIS1-tI9V1;)DqPSz)y&tj5=$g7_Ifl^0Ye>2S*6ZAY%;J$A(rX z3GqS{XV`%bH0q!Q9dM99242|04RGK=3w!`U4CdfP40sR*6h2g;33Q0U#syrk1r|PR z0Rt_Vp~n|95W)-6K;sA*AOITHfCCAnF$6Qh0Aq?IOdvI+poI^H5QY~)C_)x!d@+O` zyr=;iny5!CFa$Ex!G{ab;EX8TAY+RSh_T}dDR_Z}9CYY_6Jqd?0W3rjg*3{rg)MHN zp+yyL0K$wLbRY*jc)<%~U~!A+=s^s?h(Qx7nkYjIUC`nRI)u{!GsJKQ9zbXV3YGwc zE|yS*3$UmI32s;+h7-7;wdR3oX&%o^E25K)MA^u~Sk0%CJyun@CFaG$BDr9B?ZcAg zd67KW1Ji;%o*59N&?#d~0K*eexZsQ}+#n5D%s>f9Fhdtk;6M-FV22$nAqgMIfEa7= zB1-8b6&+2uA_-G8;SOh@QHfAO6Jv-W1Y?-s3N}p8MHOPC;SNu@!V@ysLJoEip#&=6 zqDC8}kbxC;fWiz!zylg0m;)BFz=al?@PP?ggpq?O(ttr3Ot66yZ}{N`ZV*EYK$szf z82I1{FwDRO4=6HVg)q)wgDlFBIp83I7nA@87?vP}FLY4h4R$C)5!r6SgBBaOLJVd| z0tdhtLKMj0fgYZ?;)xN|pn?op;6R5IQb^+qjs~0%1_*GlfCxYU!VMg3fDR_0q6sN@ z(1QhX;6f25bnwbB9lPgOohSLvHZHks+gaK2v%F3GU!JZ6Qfgd*1tUoDeUyp5<65na zAJ7Vf6u-ngnQ`(G)2Wfpnr)6&zb3D>&0Z^8l{YpDDdd{o6f5-HQYZD8cFoGPYjB2r z)``2)x}SS*>h|2V?Orb1zWlYwq5er;&U8;k#^b`_!GS1zFa~rHh#RnB15>D|nw~L+ zeXz6DNkOtKhc0WvvA&M}7v-dz&S;y>@3yH@*`{=^(fKm0@p7Di0u3yr)bABmn%lzI}d6w0JGOn_= 
zst64LVNSWb4x_Wl}hAfPo0Az{3r3pn(x~7GX4(WutS2yM;w++x0hA z2ee`x${iKlXUBIJ8dQ>OXVCZ}xMrcSnL zKHKJRx@+cesL;U^fYUD9^E`)L@LlZmQh#Q^B%mh0@o9viw<*Fy_e;Y*AN+v5!rM!~5v1J4#yHlx1#I zTiNV}v+)UKma$?Etijjwo|pJ4PtnUzz+o9rly}eOZEv(ZHx1u5>7Z=ZI@ogER@T`B z=hG?B&YwwHog@Uj)Ja7ob7*dg(di^1a!w}^$sdldTv4M>#<^1>46@PK%IPE_V7D;F zd)C$^a&kIJ<%;{`6B5BAp;6ZTKS|{2#p$FV*N6o&WWwpBAS9el8UnK{j&p?5Nk=kh z;|HgccvzpjDbOBGCk>HqI;ls!(n&=)^hzfQ$v>}jk`Z{Nla4%={oCA&%GK@^CY>~- zsw|OoQV>qrDxG9xltCEl6ir3h&?=oo#HLd^$;cIxh~L(=sStkF67QyTk_zY(x<7E0 z(n&@x3k2HOMS-M~gup`kNhb{fW!H-=km`FD(IQSdNr(u;PC7}2hdz}g5^r>pk@y4J z=p-T(6FrCUv(ZUGZV)s%qx2}llgNrrI`UVxxj!n6P7*TdhcdUcq|r%3SRrplCmoS8 zI%!CB5|Bw`Wmtca$l7?m=%gUgNkm|cywFKO%pwvxX^1Omg-%j=^w!|7GAwVK?>J8A zq#y28HWtTHLMQof6XF|gg|Lj!Nj)IWE(=uQ_@I-3R27wpMj!khbW)J_YOI4!;*oV7 z0!<>QD}%6{Wo~u3Z89)x49F#|JL;lw8-!c-64Se>Mjg8hZnNkt-c zl1i*7n>(kIgg_@1k6h)=dcXp+NIhCk;^{br{Boj@)^3%AKQew|%W{+(FqsJKkKi@!iw_j-}VqDNYlG zwa#3p=JK2>m5*lmrlhmUJLy)flWodMTjZ9< z7$=<8!dYMUxd-c{Qsw4*juQK8iN{dp@MNKlYxDrIsLI+Hr0_v{lxdzJ@g%d3xuhT-<<#l z2+faSD{*|l_a6Q#-{5DRaDbQ(y2wNQvnfoE;1GFFQ`g3xV_0d9>fD3S!J5vT^aq;} zc9Zg`@}@dfZ46@)7paqQTu50sLiY0q9mElaGX%WJ{@oeQkF4XT8F74#!AYIe!FbQu2S(|nBG0*ogcFt~`kKaHbgn4&9_|zk2Ex`wCmHD$Mzg7_zY!RO zRXIwhaz)i(j`91XlZ0Rw85f-72pCVNgJs_?B_6wp)k#Cz2njhtg1$LbKH0aQZgHBn zY4>ESo6?rIo;S{YaV)J)+A-~#6`3itv2I=^ZHaOdg)mAtmBi8 z*Uh%4ecRSlw#XrD^-gob_b|Z0J~mWV9jU?5jc{m0WtW9@Ys>7h&U>dPt#5{owl_`P z9DUiSjJk10H@wlczef8p8;%VWoZyQwYy|D=gF{(_<+M&Rf-$`Jv9QR@bl`i=<<=HH zZB%OAc$BhfIPI1vn14?bla%bSsv1pr-s&VGGUlA)D(}BK)Y3ZtJvedRpr*FjY-OvN z?xtTydGCaMY%r|VNkH;n77eC8R_CvUd6Lh`CmYYuHmYlE_r5pjd;llDe0rupEf~gw zMCWOmThdEoY&pJel3v*+`e2jq^^f%MP*Lt=VuTYcr13dA#-PO-gzz8zUOY(N{jDS2H@erLn{w0E$wu#-+E0_! 
z(@aW_FpMFLB1|-)&!Q)Qb@qw}?C*R(e2v^UFLb=_^M?zYFbY<6niWaV~m z%2OQFKjDcnZf`~Mk_1l|85i6$sO9oK>79^EPTRYBTg*~6o~PI*d)=j!Cox@XNm`&- z2pxR5Q{mQ@a>`}a&F+}nrBOCH?KgRUPI_aU8a%iH3QABzdBVJB{8(Ky9W)t7{eW?brKLfKWL?sqy@^o zx6Y|fccE>y&fDJRvh6+XMyIn;SAAg_9{q*st(yv2{XHXYU80w8zH+BRFM}x>%*LXr z@(h0RB6WU(SG%8*N7w#J+uWnFy@s-JM{~AR^8f=4wt#|6$K`6{rrhnr>?OAJFMY}`5F?~GEwNW zEO#btE1fUjR^2YQx4k-L%Now+-hF>$jqS$HPi@#7Q&!K z2+;5a350-x2VpoP2T%k-1!%y43S0<=Sg515dN7a_AyY~jGcqzVa@T@ObYxwU%Ag7t_#se9 z|A0Grm=>yZP%7@s_ba~{fq~iq>mqp_*8g>q8FMy5{+oO`(P18)x)}*Ma=<~8XPMpX zm#Pb_qL*;}JW3mUx7b|*TK&v{acH*u+5CEvD&fGc5*qAx>f_@`Z!+G_xSA8AH#WA` zDfJ`6mY}mMGq7iwP1wJ2P?{LqJj}(Q>MlNC%r}LjnjWJx^bX#aG=oOeRMO(nQ#B>* zY>(OK_M|v;P45+b_L`uwxc-7`@8KxQPb2qTF_m@_9v+a|gGkdeP86je4uR?{dXo-?$QP37%U^woj z+<~0K#D#tp%_>(GfS_9ASs5|?@S`JT&y!))G;WM*RVY)=9mOBURi1 z=3f}>U#j_5=lE!Jhaj1Ekb*Y}PJ8w%5F^zU*o{!)`0OcD9u3AyFP>mU41ONnRZr*M zqnBm!4_a6>Hn?Gog9B3OVS$weJ%zjUxgs8mtJc;z-0;ukPPmWNG7yXqVTjNdCa-7{ zCf(SiXfat9A4-SHjq=?D2rP^M=1c}b4c1?10bb7os`ZjH*PZrU_Kwu-oS)0`l7C~B zkWyKiphuUoQBc;}6tZz3GKv*ScEK+jeKNp-7Wz9_s6t_+))FI>13G(yr^DYdRQ(U( zP|P`G*s^fsfDLR{>C3gY=kD=5=6O>r~^}i%SwbxBN$-3)7;1 z-|NH%gsb%C&n$X2G!$LN&cXVux+k$bARJ_H&YoyNlm>OA3|v;D%Mi%w*VoEs*Vx7i zI1n+g;*IiwJvp0n_T+t`9Ut>5)Sv!9AcHV)ekrpwMKBcNMrLQ6Akg-J2qy$Q{q8~< zNpe&?apI%e4mDayjP`Swe9G8>qVS+|uPzJ<9cr)zHusPPr&hK8o6(x3PP{zR~;zFMa(=7wcr_qwZj&CpI`-luL@-*EMNuAI0GFR?FY!i3pTS_1ll3U?f# zMFOOv(K3MBy+b#??APJrh#}*33Z99(3mGOXYQ{(CRDp;fkgcnm21Nj^mT+q{1~#>G z*>XZwI%oSFH}4=eiaBY(KqW6I8lYgWuaK&8Ou&vaC zCS?cA$DWJZH_IzKvIYS+kmeAxj<~#j93>4Y)p-ZIfW)91S)~CMYRCfqofR1GEF#bs z_--Jeq^U^feGAlCiu~K(CCipos|5t=Rlf{@ziMd@_H0)Cm!9BMX$p(_NL&`nSPABA z6G5n>W&P5Eveuhs`9>2%IT!aSt`}E>Q;-u(cZI4zGrI*%VJ6^}fZRg}(lIR^lt!E- zggoFEJC0G;aC`}V={oPP>54GIA9QpSAUtje$6p3I@bi?@p zxQ)D)Br)+;A8=en*~G~aXW;n0Ri`>JK#z>~HK( zs31q19IZ8_J#EmTs8ufkbph`cG1^O{MgN0Slz3HZy0a7GdgiIEY71NlO~vVG8SKyH z#s`3cVzEvU7s%=YI1^$NIgzeB%Jvn-qF-10sS$EIz|rQYgOQYQ^MNBsfe9&L3$aV5 zZcRr?xQEd8XF{Q-WkcL=oCRl}Iv0INo!qTPL}Bhko(7Fcf}gL1N5V>0GE+GR)E+AF 
zZUa`pA*M})Q#(ZUXfJ{39Q?`@o=TT=n1jyNw%sB}{mX5S@|c#Wt)@^KJRAxydGASi z3sS26DO40@=0}0OAfT%U^Bb9gqRYT$-X9U|sR)8n8pFufI(2C%iz(ota@(or+nUgp z7ghr5oV~3o|uXJt(gnzygvbwjq{NAwkdjx|Wb2Z;_3x z91rM-NU+yD#;lf#3|)?Gy##yFfS;a&*il z2*-#Hknfanv{FMtuWhCFF4WYOPSwi@K|cXKAYVSG9^h~TpM+XjJ%8$7B2?BVlba#0 zM|hm5DE!_vj_yo|PqQmF)exeL)R4|Nw-hO!2XWk>;i(}04%ly?(@Gv$NRpLw-ydXW z&fJNZG2hesC*C56gC)K-h|sTUO0CM{98n5j;Z(u1@7bm8}b9@tSbr3wl;j;Ww5>{zmheeD{`j931zv3hDtx!noY98KClE4{A?GxPl zB#F*uDUBF`&md$$+S?gX5$>Ju`AjNG327mc09zG4xKgk!q2TQ<{p#=h<5ks%5djHZ zFVP|)$N;q1K)TA;7_utN98sS|{W4OJ3R;?KMl?P)KgGtJeyTwO<3>hWS;m_w?PQ zJ-%OBVsB_e7dD@ZooYH&9KW&@-uxlBrPg}2sjFZt3~57IF5PMHPyEvC)RO9vDrR~K z*{~NR3y`xWh^CqYPnS@oAtVIw5b0Ra{i~V=@WGlic!Xa=J+KM^BcRFeIlfNIq`!$= zc;yQ!wq|PtT5B16D=n&XI_b!4bU^_Jz9Ct(0vVDqedXnE-ILl_i*a?M*RI#X=)>nr zU%IilA(-p20+RrfP*=|Rxeb92pP~JXpuuVd(pL&v3)KQQb`J$X&@v}q7su?YL+d?{;$4F;&5}o@0w9=dNpCNu4p*c6w@`itb z*@kXYTpDSTV5XZA_D`xfzNjLg@=5H~@`agju6NwpkJ zI*Z+DVi3V99f?}0MY71t^bgLDe5AvvUc{_4^)F1xaK+!v=cy)=36wN{l6sA?cC?ZW zPJDGK^#NqkL`>M`KJeviK^5ev}=NsxaQM3tY4xrcu#N z0(eJfV%Bzmfr!kwb=3PLBUZa%ss}C@UwWnG?~LStf!^iS?13P>0%`_iSsXK%VeKG0cIZCly4=6kJCZDH5ut9IAvV z`7zk*YNMYHY+E>3+`}RqvYHd1VVfsAX84picyqW`u5=31gMzU?hfqf`4tG>F;t({n z{b;-DXrx~15Sh-SWKh?9qKN;&>iKpf#fGvU7H>oEzV>ND@7TVKwBC;5^99Bpbya)iy6$$FZUjhq?^8)u(2;vdL=H0 zme&Gjza>T@r|$<|n7+Y$S}`MhI3i7BMa1=#Fd@HZ>;FJU!{w9m55d(-FOjRjLVH2k zb=r|qG|jNyC=5^^-q1~hoESenzZD=_E|z^){vP70m&+d~fAzxxen4r72_##E=r%v| zVrepmJy|BNuO8MA~1pqo}!%-3%gswPaFQKv)vQP{}{=c5u^AK(+5#%CbK9-_UIclIh3e zsrU6fVS|rb5XsDh*FX3>(>I3&EQtQX=JP%irjue_+&l&oEi#N|=o#j@d(Lqfd&Ua0 zXDNpk)Z;z42+L76$E^z;+rHNylaPn`?-Jk};Vd>Drv(J*-Y>OGXu>Br`_j7zNK z6P|E3;0LW^DWSLW$WPb?{JP%5{C3Cuop(|K@{ZUo)xOt&cllloF}GNwBm@4Lz`axx6J*2T=#hr@-Y5aWTNOro~U zjiqV(ZL1mr0*g+cr9S6)cY9tWz+1BLASCd2?xU&+!BK{}M9vgygFP5hpys83-+p6^ z`lK&|b(w_vL1~L4woY)ma; zG`;Ni1rU;pVHYbl{Sdsfl}bU6)mA%vh7r)9+v%`r?tu)O|Ed6fH0XYGE)Fe+(wm?3 zdvO5;pfB|2hEZn)&SAjPx?)>up>{aA8aoe6V~U{Gpv**&{Z2`MaYv*(1M~(VsRgJB z%6BmI9;)a4x#K*W3Xc=oDFL}`MI=&xt_ZsJ_E!FO_lg 
z8B^j_kv#d5B()6-M+W2<6(3W=LP6KWM#l0Q$6{~W^yIo3wO6sSYkyND7d7Z@;2}31 zH219QEM?maUIVeej>kdMt-i7LHRC5=*^DO6O$L#JT!gf_N#hxy9@U_%>rY`YDCtd< z0LDEGY7XH&=Hh8k&nU6l@D{?@QR&GhkKNd(_A0f}y5uTCDrwDnzV>zLql7QoV0|Cp zj22V*q}N=#IE+iaVOTwqrsrTE7~3XYei6BCP1Yf~W}Wd#ZMV#v!U7|6*|BOkb5)mXWoLG4@~u9J z8+)r~)L^u-IF5y4ZADSkHsn*lQD!E;ycd9@s4=l*hD1aH5lNb2=Ac5cDsBT3ps{ce zVkm|J8H!;HGe&>}#3&>fkpvQnj{*KW>9oc~8p4SQoUc@wtVfn?`($OpQvvMkZ_yrH zVvaPuy@_5iF+~hz3Wk%9q3ZUP2fb41o}Dua;$9jU%i$;w*aQp#P$$X(lNpQR0LY3L zua#HJjo13kHNE0M zXaYZ$>52k<%Vq|p=}^uIt|8EvNLiBxnXJ$T{c1saE=RnR1ceQBn!ifbE@P2H`O&5f zxd1&;CQO_II_8gC5cgqVBna;hSC=Y)?Q^2!HY3ecfQT#|oZpt?Gn!DH#Z$j&H3F{f=O5s|pPO00000001bpFa00@0RRAti=?~& diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.idx/.index.crc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1.idx/.index.crc similarity index 100% rename from v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.idx/.index.crc rename to v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1.idx/.index.crc diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1.idx/.metadata.json.gz.crc similarity index 100% rename from v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.idx/.metadata.json.gz.crc rename to v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1.idx/.metadata.json.gz.crc diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.idx/index 
b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1.idx/index similarity index 100% rename from v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.idx/index rename to v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1.idx/index diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.idx/metadata.json.gz b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1.idx/metadata.json.gz similarity index 100% rename from v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.idx/metadata.json.gz rename to v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1.idx/metadata.json.gz diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/.metadata.json.gz.crc index 6d825bf424557eb3ac139e6d12e0964c35d5993b..d019d1f08958cbf56ec7602d5c7f8848d3ff6d47 100644 GIT binary patch literal 24 gcmYc;N@ieSU}8A6W9nHk@fRQG?7zNdTjuIp0CI~9rvLx| literal 24 fcmYc;N@ieSU}Dg}=E8V3gPH%fp|Gx-l2{Y~Nx}v2 diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/metadata.json.gz b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/metadata.json.gz index 229f082a38beef9f77f5dd58ea05f2d2a5c1d000..4cff7dfd7fc10dc132a369be4d4520009ac7a332 100644 GIT binary patch literal 1820 zcmV+%2jlo3iwFP!000000Nt7YPun;Y!2g&0bko8PC}UfHE2RuHHY!?ir<;&v5~uY@ z;+*4hFqHrOo)gEh9k-=O`(@KKEzi&Ii~aokN%|b*I3vL@xX5GjNaE|8(^)~H!ypXM zJ$VX---DDzs}i1Znv#^1!7U6$EGCgu42I9Gh51uK;;|k8Eh0rQ@^*59OEMFD6^ZBC z+|Uw5EEjYkWi-e9DJ)n?1!Xx%E^;wAtU~1*1S8Jz(|4tL8}h~A18Pj}W|5aP9=5|J 
zu^pg>aF(boZp@fx)ge@!5Rf(dMD)(yWjxqr|P+Mpt(rF<E)2OVBEDuOF{Dm*h)B@8N?V*%66&rC|e1ZY*JX6(u6(` zaA_8-vES{eiikRUC96Ykz!&xt-;d$oK+d)hv`M}YOD#QxNHBPsv+?Py^)<#tC!Yz- zMLS}OSXqHcDpo6MWdkHsAsRYGr2&2QL(ILYC`SRtDckpX7j75cMPWjSu4>B zP^!{y^q_pvdJl5cveF7{D^*q9hN^1J@_&RDRjV8yg$seefpEIyXxzFQebfIt)8`>v zvP_?}@#x~T>q|=#-9y19(wy*^8Gh&yU(G0wMXYANH5!w=Y#F4Y}YC8=hrok~9#Xq`&K%KGr$*2oC zrgs}6F($SbEW*@s9+OmHcP*5v_dbIJDAccIWFjTbwpO8JA?UL95v?Ke!t7fK_>y<( z)Ibc{HJi6|$;zh4MrAc(TxuKcv?d)BlP%^ok=N$qZE9XH zR-xHk>_uXih`lVl28GR>91z|Sf&B!$nSHmRua|r`g6~@HHIY{>twp@1OareH+Q7FK zcfGXhz^==>%7YhmJ*2DDJs7%ya~;gpg=JIjM#yy-SDA1T*Gsrcw+38?Z@bG(gRQ>Q zOsqAjwrw36YIj6ht0%qU-1|wkiRwilYXz{~Uk_lltBw5XkgE-@J!WkewMAAhv39U% z@P6vECi*FL2cK);^aVZ@nXX|{yF)4>T?3=rSk!}}9uoC|=td6pVQ8O1 z{Sdl^L2U%>9azu4blV|e(FD&Pa~jC$5T^y47H`_PY0+j6Ha(oV88a=)^Z;g$FI}|U zN|x5??_r!tlzpD;;iOG$229!mHcDFE#UFv<$c`fKR!Ks*^TGFWa&mC^@uV9HgJ?<9 z*ibnEfV_toETuasoY9}4aQy!L;J82SpHjs2)E(!^dmCr&%u4=&Bx?L}a=2(^^%rkW zQn_CsfA{XsT3t%vc1h~*+AsRpZN4%jl$Y`jd~A_iu50%d+j_l#k>RiVP+o-qRwWTS z5QB^?7jSv|-W$41j6AGbmhtcUlA)vO#pFcuo75j1+8n3fpln`OPqjZ@e?;+AP+hQ+ z?SY={&FBl*=e*LdLsI&bWcX~#0u3t*LAKXEs0H|gCA)tAbeM1IHGg++VihlF|! 
zy4A6*RDm>J!0W~tmA@?xzX$SRZ-4Lor%wdO@!=jhj0by1gJ`ga?=armj}DT-e)4H> z7a!C=MgF+P`GTCll^M(V&p|2QUSwj;50~(tri8(3wB!d6PFupk@L+#`|DfuA+XVk} zP#6WU;{m+&zSPXo;nDHYz_B!*OIzdG7R{q{6_Z&L8zfJ(Y$DxANhQ<4LU8G=s{RL_ Ko8|%?6U?j?B zqiZQTn>5gW-=RcNBt>5V`qH8(s^M@Baya}Xzj-N6h&S-A(~!K7@b3O{mXqMf^F8!L zUcJFLFV2Eh0na#&NlXgw0fvGsB!N)$25*gp`D;$Xks1IkB1ACqVSIrLGUIF&@VCm` zzz_vl%IQMLXo}gZpJxT-G)qBpo$~%s=}X@r7&3-mze&vppDp@ls5ZHuMOM&o&0S;!(u~6nR2G_uR9c7=mY^zrnO>#x z;FA|vCp}jxN~5vBGp-UWeVTH%yEY>cct`HsaGWn;Rf~c^h=#=+^JNB$M+3@V2jfYc z;mTZ`MVt_`lxP9f5TO(=;4fIovY6mhsG^9*kg_A=(l7Fu285ACT_T!2^1=xu6rws) z8lYMi)yn7@Gm2BLJeB@Rq?bb8JY%*kT5y^!z*fZI%%Ch|QPC`w8qHQbi#92&bZJDN z3Aoe?R@?7pR7OOty%N=-HsEvi13!)6;6To%5VT3U;7cXFgh()WnrEZSS>tPj^Hx6N znDJ&r7iW1R9Y51hgxl`YFy^&_bR(ag-X>y!{-S(Y$>fB}t5ZEC|IOyNQ|B0tIWktF z5ujM6UF$*lqV*ocs9~iMm{!WNeDGz}=;i+cEviyEJ@ywIfdg)J$>FGRHT{ z_s~e+?Z2wH_xgvHPl(lCWfzqN)osk-V_HPH5$lxE1amUHw)Vu2;F?Gh^yTwSvz?a0 zXn1*v@+Bi-E1O+U=CPf2$ zF{XAKBGD!`7c9cma2}GFV|y*6s`EaB1Sr(6VPq^M*0z?uU?J$T<`JzR@>=g(5%>~! z>QqAv+BTaHYca%Vf2*or)Dc2k$>LC46Y7dCR*wngu(4VbllTfDgcX=}hoE^7CWc&H zZA~nt56q70SWb^i!Qju5!43Mp5&!=7>29>_pZ~7lx^@AIupkCmYwBme*W_GlZ3Aaz z(MD!Dy(MM?nB{20%kr_tW$htv9!#K=LYIwl*;t0S+;$J^AL zU@Sv3x!8%sHW52ncnu2cIawgQCjz?(cr*KMLtiKPZUo=8+^Zw6Tv~&8b(sQQDKvp^ zE$%vL*MeP}b)^R<>N-eQsyi@r1LsMZHF|IUWBd(KhrEUed7T>m)nF3pN zsp(j&Q*F~a6x8mCv_?;Q$GLZtY#r53AZrA$-CqY_m8*^XYLTl6t{rA=7PUcEC$YA$ zsPTqE#g;qPgQ~@e1}3^Gbq}9w;B*I{l1$eyso5bVk*7D>!k{LCb`Gp#U)t>uu&9G)hdDLmw20FHPJ=g1+%#yj1Dg)c+>Dt9WjX+}!a%4-9kEYzj=jtdl8A%KU~6p>JkR8!IJeLoTh}mLGSSJuvfOfZG!(f zC=5N=aSz^lcQtc-bbNZ;w=9k3!q%v=MbjW&g=AL82FVj88w)o=Qi^mi=S+Aj%l`pS KUl|0O9RL6XmUD{$ diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/.part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.crc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/.part-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1.crc similarity index 100% rename from 
v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/.part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3.crc rename to v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/.part-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1.crc diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3 b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/part-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1 similarity index 100% rename from v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/part-0-fefbe7e9-2fec-4711-8810-155cf3f03ab3 rename to v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/part-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1 From 69e7bf1f0e6daa9548014325c95e9711f2d2676a Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 24 Apr 2025 10:14:47 -0400 Subject: [PATCH 30/55] reformat test --- .../exports/write_new_entries_parquet.py | 29 ++++++++++--------- .../exports/write_new_entries_parquet_test.py | 22 ++++++++++---- .../exports/write_new_variants_parquet.py | 2 +- .../write_new_variants_parquet_test.py | 2 +- 4 files changed, 34 insertions(+), 21 deletions(-) diff --git a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py index ca33c1dfd..6d5170fec 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py +++ b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py @@ -113,23 +113,25 @@ def create_table(self) -> None: ht = ht.key_by() ht = ht.select( + key=ht.key_, project_guid=project_guid, family_guid=ht.family_entries.family_guid[0], - is_gnomad_gt_5_percent=ht.is_gt_5_percent, - key=ht.key_, - sample_ids=ht.family_samples[ht.family_entries.family_guid[0]], + sample_type=self.sample_type.value, xpos=get_expr_for_xpos(ht.locus), + is_gnomad_gt_5_percent=ht.is_gt_5_percent, filters=ht.filters, - 
sample_type=self.sample_type.value, - GQ=ht.family_entries.GQ, - AB=ht.family_entries.AB, - DP=ht.family_entries.DP, - GT=ht.family_entries.GT.map( - lambda x: hl.case() - .when(x.is_hom_ref(), 0) - .when(x.is_het(), 1) - .when(x.is_hom_var(), 2) - .default(hl.missing(hl.tint32)), + calls=hl.Struct( + sampleId=ht.family_samples[ht.family_entries.family_guid[0]], + gt=ht.family_entries.GQ, + gq=ht.family_entries.AB, + ab=ht.family_entries.DP, + dp=ht.family_entries.GT.map( + lambda x: hl.case() + .when(x.is_hom_ref(), 0) + .when(x.is_het(), 1) + .when(x.is_hom_var(), 2) + .default(hl.missing(hl.tint32)), + ), ), ) if not unioned_ht: @@ -137,4 +139,3 @@ def create_table(self) -> None: else: unioned_ht = unioned_ht.union(ht) return unioned_ht - diff --git a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py index 372a5cde6..2b72de8d5 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py +++ b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py @@ -20,22 +20,34 @@ from v03_pipeline.lib.test.mocked_reference_datasets_testcase import ( MockedReferenceDatasetsTestCase, ) +from v03_pipeline.lib.misc.io import import_vcf, remap_pedigree_hash TEST_RUN_ID = 'manual__2024-04-03' TEST_PEDIGREE_3_REMAP = 'v03_pipeline/var/test/pedigrees/test_pedigree_3_remap.tsv' TEST_PEDIGREE_4_REMAP = 'v03_pipeline/var/test/pedigrees/test_pedigree_4_remap.tsv' -TEST_SNV_INDEL_ANNOTATIONS = ( - 'v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht' -) TEST_SNV_INDEL_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' class WriteNewEntriesParquetTest(MockedReferenceDatasetsTestCase): def setUp(self) -> None: super().setUp() - ht = hl.read_table( - TEST_SNV_INDEL_ANNOTATIONS, + mt = import_vcf(TEST_SNV_INDEL_VCF, ReferenceGenome.GRCh38) + ht = mt.rows() + ht = ht.add_index(name='key_') + ht = ht.annotate_globals( + updates={ + hl.Struct( + 
callset=TEST_SNV_INDEL_VCF, + project_guid='R0113_test_project', + remap_pedigree_hash=remap_pedigree_hash(TEST_PEDIGREE_3_REMAP), + ), + hl.Struct( + callset=TEST_SNV_INDEL_VCF, + project_guid='R0114_project4', + remap_pedigree_hash=remap_pedigree_hash(TEST_PEDIGREE_4_REMAP), + ), + } ) ht.write( variant_annotations_table_path( diff --git a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py index 749745314..2ccf80417 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py +++ b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py @@ -74,7 +74,7 @@ def create_table(self) -> None: return ht.select( key_=ht.key_, xpos=ht.xpos, - chrom=ht.locus.contig, + chrom=ht.locus.contig.replace('^chr', ''), pos=ht.locus.position, ref=ht.alleles[0], alt=ht.alleles[1], diff --git a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py index 90c294b5c..cbc7dcaa8 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py +++ b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py @@ -76,7 +76,7 @@ def test_write_new_variants_parquet_test( { 'key': 0, 'xpos': 1000939121, - 'chrom': 'chr1', + 'chrom': '1', 'pos': 939121, 'ref': 'C', 'alt': 'T', From 3e79ff2206fb45ff9f0359a9562b839521ebd10b Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 24 Apr 2025 11:12:03 -0400 Subject: [PATCH 31/55] another batch --- .../exports/write_new_entries_parquet.py | 13 +++--- .../exports/write_new_entries_parquet_test.py | 44 ++++++++++++++++--- .../var/test/callsets/1kg_30variants.vcf | 2 +- 3 files changed, 46 insertions(+), 13 deletions(-) diff --git a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py index 6d5170fec..9bc3d7a99 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py +++ 
b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py @@ -99,11 +99,12 @@ def create_table(self) -> None: self.input()['annotations_table_task'].path, ) ht = ht.join(annotations_ht) + if self.input().get('high_af_variants_table_task'): gnomad_high_af_ht = hl.read_table( self.input()['high_af_variants_table_task'].path, ) - ht = ht.join(gnomad_high_af_ht) + ht = ht.join(gnomad_high_af_ht, 'left') # the family entries ht will contain rows # where at least one family is defined... after explosion, @@ -118,20 +119,20 @@ def create_table(self) -> None: family_guid=ht.family_entries.family_guid[0], sample_type=self.sample_type.value, xpos=get_expr_for_xpos(ht.locus), - is_gnomad_gt_5_percent=ht.is_gt_5_percent, + is_gnomad_gt_5_percent=hl.is_defined(ht.is_gt_5_percent), filters=ht.filters, calls=hl.Struct( sampleId=ht.family_samples[ht.family_entries.family_guid[0]], - gt=ht.family_entries.GQ, - gq=ht.family_entries.AB, - ab=ht.family_entries.DP, - dp=ht.family_entries.GT.map( + gt=ht.family_entries.GT.map( lambda x: hl.case() .when(x.is_hom_ref(), 0) .when(x.is_het(), 1) .when(x.is_hom_var(), 2) .default(hl.missing(hl.tint32)), ), + gq=ht.family_entries.GQ, + ab=ht.family_entries.AB, + dp=ht.family_entries.DP, ), ) if not unioned_ht: diff --git a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py index 2b72de8d5..f36892a26 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py +++ b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py @@ -22,12 +22,12 @@ ) from v03_pipeline.lib.misc.io import import_vcf, remap_pedigree_hash -TEST_RUN_ID = 'manual__2024-04-03' - TEST_PEDIGREE_3_REMAP = 'v03_pipeline/var/test/pedigrees/test_pedigree_3_remap.tsv' TEST_PEDIGREE_4_REMAP = 'v03_pipeline/var/test/pedigrees/test_pedigree_4_remap.tsv' TEST_SNV_INDEL_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' +TEST_RUN_ID = 'manual__2024-04-03' + class 
WriteNewEntriesParquetTest(MockedReferenceDatasetsTestCase): def setUp(self) -> None: @@ -81,9 +81,41 @@ def test_write_new_entries_parquet(self): ), ), ) - export_json = convert_ndarray_to_list(df.head(1).to_dict('records')) - self.assertListEqual(list(export_json[0].keys()), ['key', 'transcripts']) + export_json = convert_ndarray_to_list(df.to_dict('records')) self.assertEqual( - export_json[0]['key'], - 0, + export_json[:2], + [ + { + 'key': 2, + 'project_guid': 'R0113_test_project', + 'family_guid': 'abc_1', + 'sample_type': 'WGS', + 'xpos': 1000876499, + 'is_gnomad_gt_5_percent': False, + 'filters': [], + 'calls': { + 'sampleId': ['HG00731_1', 'HG00732_1', 'HG00733_1'], + 'gt': [2, 2, 2], + 'gq': [21, 24, 12], + 'ab': [1.0, 1.0, 1.0], + 'dp': [7, 8, 4], + }, + }, + { + 'key': 3, + 'project_guid': 'R0113_test_project', + 'family_guid': 'abc_1', + 'sample_type': 'WGS', + 'xpos': 1000878314, + 'is_gnomad_gt_5_percent': False, + 'filters': ['VQSRTrancheSNP99.00to99.90'], + 'calls': { + 'sampleId': ['HG00731_1', 'HG00732_1', 'HG00733_1'], + 'gt': [1, 0, 1], + 'gq': [30, 6, 61], + 'ab': [0.3333333432674408, 0.0, 0.6000000238418579], + 'dp': [3, 2, 5], + }, + }, + ], ) diff --git a/v03_pipeline/var/test/callsets/1kg_30variants.vcf b/v03_pipeline/var/test/callsets/1kg_30variants.vcf index c92e9b642..010314986 100644 --- a/v03_pipeline/var/test/callsets/1kg_30variants.vcf +++ b/v03_pipeline/var/test/callsets/1kg_30variants.vcf @@ -128,7 +128,7 @@ 1 871269 . 
A C 368.47 PASS AC=1;AF=3.10000e-02;AN=32;BaseQRankSum=-1.74060e+01;DP=351;Dels=0.00000e+00;FS=9.28030e+01;HaplotypeScore=5.13800e-01;InbreedingCoeff=-3.32000e-02;MQ=5.93000e+01;MQ0=0;MQRankSum=2.37300e+00;QD=4.80000e-01;ReadPosRankSum=-1.17980e+01;SNPEFF_AMINO_ACID_CHANGE=R141;SNPEFF_CODON_CHANGE=cgA/cgC;SNPEFF_EFFECT=SYNONYMOUS_CODING;SNPEFF_EXON_ID=exon_1_871152_871276;SNPEFF_FUNCTIONAL_CLASS=SILENT;SNPEFF_GENE_BIOTYPE=protein_coding;SNPEFF_GENE_NAME=SAMD11;SNPEFF_IMPACT=LOW;SNPEFF_TRANSCRIPT_ID=ENST00000342066;VQSLOD=-8.16700e-01;culprit=QD;CSQ=C|ENSG00000187634|ENST00000455979|Transcript|upstream_gene_variant|||||||1|3386|1|SAMD11|HGNC|28706|protein_coding||||ENSP00000412228|||UPI000155D479||||||||||||||||||||||||EXON_INTRON_UNDEF|||||||,C|ENSG00000187634|ENST00000420190|Transcript|synonymous_variant|512|423|141|R|cgA/cgC||1||1|SAMD11|HGNC|28706|protein_coding||||ENSP00000411579||Q5SV95_HUMAN&I7FV93_HUMAN&A6PWC8_HUMAN|UPI000155D47C|||5/7|||ENST00000420190.1:c.423N>C|ENST00000420190.1:c.423N>C(p.%3D)|||||||||||||||POSITION:0.787709497206704||NON_CAN_SPLICE_SURR|||||||,C|ENSG00000268179|ENST00000598827|Transcript|upstream_gene_variant|||||||1|4824|-1|AL645608.1|Clone_based_ensembl_gene||protein_coding|YES|||ENSP00000471152||M0R0C9_HUMAN|UPI0000D61E05||||||||||||||||||||||||EXON_INTRON_UNDEF|||||||,C|ENSG00000187634|ENST00000437963|Transcript|downstream_gene_variant|||||||1|96|1|SAMD11|HGNC|28706|protein_coding||||ENSP00000393181||Q5SV95_HUMAN&I7FV93_HUMAN|UPI000155D47B||||||||||||||||||||||||EXON_INTRON_UNDEF|||||||,C|ENSG00000187634|ENST00000478729|Transcript|upstream_gene_variant|||||||1|4457|1|SAMD11|HGNC|28706|processed_transcript||||||||||||||||||||||||||||||||||||||,C|ENSG00000187634|ENST00000342066|Transcript|synonymous_variant|506|423|141|R|cgA/cgC||1||1|SAMD11|HGNC|28706|protein_coding|YES||CCDS2.2|ENSP00000342313|SAM11_HUMAN|Q5SV95_HUMAN&I7FV93_HUMAN&A6PWC8_HUMAN|UPI0000D61E04|||5/14|||ENST00000342066.3:c.423N>C|ENST00000342066.3:c.423N>C(p.%3D)||||||||
|||||||POSITION:0.206744868035191||NON_CAN_SPLICE_SURR|||||||,C|ENSG00000187634|ENST00000341065|Transcript|synonymous_variant|194|195|65|R|cgA/cgC||1||1|SAMD11|HGNC|28706|protein_coding||||ENSP00000349216|||UPI000155D47A|||3/12|||ENST00000341065.4:c.194N>C|ENST00000341065.4:c.194N>C(p.%3D)|||||||||||||||POSITION:0.110231769361221||NON_CAN_SPLICE_SURR|||||||,C||ENSR00000528855|RegulatoryFeature|regulatory_region_variant|||||||1||||||regulatory_region|||||||||||||||||||||||||||||||||||||| GT:AD:DP:GQ:PL 0/0:34,0:34:99:0,102,1073 0/0:34,0:34:99:0,102,1064 0/0:37,0:37:99:0,108,1155 0/0:8,3:11:24:0,24,226 0/1:11,4:16:32:32,0,300 0/0:10,0:10:30:0,30,306 0/0:13,0:13:39:0,39,410 0/0:11,0:11:33:0,33,323 0/0:21,3:23:12:0,12,434 0/0:19,0:19:57:0,57,581 0/0:25,4:28:27:0,27,553 0/0:17,1:18:51:0,51,524 0/0:25,0:25:75:0,75,759 0/0:21,0:21:63:0,63,687 0/0:23,4:27:69:0,69,709 0/0:22,2:24:60:0,60,562 1 874734 rs145967298 C T 2645.29 PASS AC=1;AF=3.10000e-02;AN=32;BaseQRankSum=-7.95400e+00;DB;DP=484;Dels=0.00000e+00;FS=7.13100e+00;HaplotypeScore=3.21100e-01;InbreedingCoeff=-6.80000e-03;MQ=5.85800e+01;MQ0=1;MQRankSum=7.38500e+00;QD=1.23600e+01;ReadPosRankSum=-7.54000e-01;SNPEFF_AMINO_ACID_CHANGE=S200;SNPEFF_CODON_CHANGE=agC/agT;SNPEFF_EFFECT=SYNONYMOUS_CODING;SNPEFF_EXON_ID=exon_1_874655_874840;SNPEFF_FUNCTIONAL_CLASS=SILENT;SNPEFF_GENE_BIOTYPE=protein_coding;SNPEFF_GENE_NAME=SAMD11;SNPEFF_IMPACT=LOW;SNPEFF_TRANSCRIPT_ID=ENST00000342066;VQSLOD=5.39000e+00;culprit=InbreedingCoeff;CSQ=T|ENSG00000187634|ENST00000420190|Transcript|downstream_gene_variant||||||rs145967298|1|63|1|SAMD11|HGNC|28706|protein_coding||||ENSP00000411579||Q5SV95_HUMAN&I7FV93_HUMAN&A6PWC8_HUMAN|UPI000155D47C||||||||T:0.0009||T:0.01|||T:0.000227|T:0.001628||||||||||EXON_INTRON_UNDEF|||||||,T|ENSG00000187634|ENST00000342066|Transcript|synonymous_variant|683|600|200|S|agC/agT|rs145967298|1||1|SAMD11|HGNC|28706|protein_coding|YES||CCDS2.2|ENSP00000342313|SAM11_HUMAN|Q5SV95_HUMAN&I7FV93_HUMAN&A6PWC8_HUMAN|UPI0000D61E04||
|7/14|||ENST00000342066.3:c.600N>T|ENST00000342066.3:c.600N>T(p.%3D)|T:0.0009||T:0.01|||T:0.000227|T:0.001628||||||||POSITION:0.293255131964809||NON_CAN_SPLICE_SURR|||||||,T|ENSG00000188976|ENST00000327044|Transcript|downstream_gene_variant||||||rs145967298|1|4850|-1|NOC2L|HGNC|24517|protein_coding|YES||CCDS3.1|ENSP00000317992|NOC2L_HUMAN||UPI000041820C||||||||T:0.0009||T:0.01|||T:0.000227|T:0.001628||||||||||EXON_INTRON_UNDEF|||||||,T|ENSG00000187634|ENST00000464948|Transcript|upstream_gene_variant||||||rs145967298|1|2812|1|SAMD11|HGNC|28706|retained_intron|||||||||||||||T:0.0009||T:0.01|||T:0.000227|T:0.001628|||||||||||||||||,T|ENSG00000187634|ENST00000466827|Transcript|upstream_gene_variant||||||rs145967298|1|2749|1|SAMD11|HGNC|28706|retained_intron|||||||||||||||T:0.0009||T:0.01|||T:0.000227|T:0.001628|||||||||||||||||,T|ENSG00000187634|ENST00000474461|Transcript|upstream_gene_variant||||||rs145967298|1|1722|1|SAMD11|HGNC|28706|retained_intron|||||||||||||||T:0.0009||T:0.01|||T:0.000227|T:0.001628|||||||||||||||||,T|ENSG00000188976|ENST00000483767|Transcript|downstream_gene_variant||||||rs145967298|1|4850|-1|NOC2L|HGNC|24517|retained_intron|||||||||||||||T:0.0009||T:0.01|||T:0.000227|T:0.001628|||||||||||||||||,T|ENSG00000187634|ENST00000455979|Transcript|synonymous_variant|80|81|27|S|agC/agT|rs145967298|1||1|SAMD11|HGNC|28706|protein_coding||||ENSP00000412228|||UPI000155D479|||1/7|||ENST00000455979.1:c.80N>T|ENST00000455979.1:c.80N>T(p.%3D)|T:0.0009||T:0.01|||T:0.000227|T:0.001628||||||||POSITION:0.0498461538461538||NON_CAN_SPLICE_SURR|||||||,T|ENSG00000187634|ENST00000437963|Transcript|downstream_gene_variant||||||rs145967298|1|3561|1|SAMD11|HGNC|28706|protein_coding||||ENSP00000393181||Q5SV95_HUMAN&I7FV93_HUMAN|UPI000155D47B||||||||T:0.0009||T:0.01|||T:0.000227|T:0.001628||||||||||EXON_INTRON_UNDEF|||||||,T|ENSG00000187634|ENST00000478729|Transcript|upstream_gene_variant||||||rs145967298|1|992|1|SAMD11|HGNC|28706|processed_transcript|||||||||||||||T:0.0009||
T:0.01|||T:0.000227|T:0.001628|||||||||||||||||,T|ENSG00000188976|ENST00000477976|Transcript|downstream_gene_variant||||||rs145967298|1|4851|-1|NOC2L|HGNC|24517|retained_intron|||||||||||||||T:0.0009||T:0.01|||T:0.000227|T:0.001628|||||||||||||||||,T|ENSG00000187634|ENST00000341065|Transcript|synonymous_variant|371|372|124|S|agC/agT|rs145967298|1||1|SAMD11|HGNC|28706|protein_coding||||ENSP00000349216|||UPI000155D47A|||5/12|||ENST00000341065.4:c.371N>T|ENST00000341065.4:c.371N>T(p.%3D)|T:0.0009||T:0.01|||T:0.000227|T:0.001628||||||||POSITION:0.210288298473714||NON_CAN_SPLICE_SURR|||||||,T||ENSR00001516735|RegulatoryFeature|regulatory_region_variant||||||rs145967298|1||||||regulatory_region|||||||||||||||T:0.0009||T:0.01|||T:0.000227|T:0.001628||||||||||||||||| GT:AD:DP:GQ:PL 0/0:37,0:37:99:0,111,1368 0/0:24,0:24:66:0,66,870 0/0:32,0:32:96:0,96,1183 0/0:8,0:8:24:0,24,245 0/0:8,0:8:24:0,24,267 0/0:9,0:9:27:0,27,318 0/0:26,0:26:75:0,75,879 0/0:31,0:31:93:0,93,1120 0/0:39,0:39:99:0,117,1375 0/0:56,0:56:99:0,169,2023 0/1:21,22:42:99:519,0,613 0/0:29,0:29:87:0,87,1048 0/0:43,0:43:99:0,129,1556 0/0:20,0:20:60:0,60,741 0/0:45,0:45:99:0,135,1586 0/0:35,0:35:99:0,105,1239 1 876499 rs4372192 A G 212847.01 PASS 
AC=31;AF=9.69000e-01;AN=32;BaseQRankSum=2.42570e+01;DB;DP=122;Dels=1.00000e-02;FS=9.96800e+00;HaplotypeScore=2.51500e-01;InbreedingCoeff=4.11000e-02;MQ=5.91300e+01;MQ0=0;MQRankSum=7.64000e-01;QD=3.03200e+01;ReadPosRankSum=2.13900e+00;SNPEFF_EFFECT=DOWNSTREAM;SNPEFF_FUNCTIONAL_CLASS=NONE;SNPEFF_GENE_BIOTYPE=processed_transcript;SNPEFF_GENE_NAME=NOC2L;SNPEFF_IMPACT=MODIFIER;SNPEFF_TRANSCRIPT_ID=ENST00000327044;VQSLOD=4.70000e+00;culprit=MQ;CSQ=G|ENSG00000187634|ENST00000420190|Transcript|downstream_gene_variant||||||rs4372192|1|1828|1|SAMD11|HGNC|28706|protein_coding||||ENSP00000411579||Q5SV95_HUMAN&I7FV93_HUMAN&A6PWC8_HUMAN|UPI000155D47C||||||||A:0.0822|G:0.86|G:0.92|G:0.92|G:0.95|G:0.883310|G:0.937676||||||||||EXON_INTRON_UNDEF&ANC_ALLELE|||||||,G|ENSG00000188976|ENST00000496938|Transcript|downstream_gene_variant||||||rs4372192|1|4200|-1|NOC2L|HGNC|24517|processed_transcript|||||||||||||||A:0.0822|G:0.86|G:0.92|G:0.92|G:0.95|G:0.883310|G:0.937676|||||||||||||||||,G|ENSG00000187634|ENST00000342066|Transcript|intron_variant||||||rs4372192|1||1|SAMD11|HGNC|28706|protein_coding|YES||CCDS2.2|ENSP00000342313|SAM11_HUMAN|Q5SV95_HUMAN&I7FV93_HUMAN&A6PWC8_HUMAN|UPI0000D61E04||||7/13||ENST00000342066.3:c.707-25N>G||A:0.0822|G:0.86|G:0.92|G:0.92|G:0.95|G:0.883310|G:0.937676||||||||INTRON_SIZE:1683||NON_CAN_SPLICE&ANC_ALLELE|||||||,G|ENSG00000188976|ENST00000327044|Transcript|downstream_gene_variant||||||rs4372192|1|3085|-1|NOC2L|HGNC|24517|protein_coding|YES||CCDS3.1|ENSP00000317992|NOC2L_HUMAN||UPI000041820C||||||||A:0.0822|G:0.86|G:0.92|G:0.92|G:0.95|G:0.883310|G:0.937676||||||||||EXON_INTRON_UNDEF&ANC_ALLELE|||||||,G|ENSG00000187634|ENST00000464948|Transcript|upstream_gene_variant||||||rs4372192|1|1047|1|SAMD11|HGNC|28706|retained_intron|||||||||||||||A:0.0822|G:0.86|G:0.92|G:0.92|G:0.95|G:0.883310|G:0.937676|||||||||||||||||,G|ENSG00000187634|ENST00000466827|Transcript|upstream_gene_variant||||||rs4372192|1|984|1|SAMD11|HGNC|28706|retained_intron|||||||||||||||A:0.0822|G:0
.86|G:0.92|G:0.92|G:0.95|G:0.883310|G:0.937676|||||||||||||||||,G|ENSG00000187634|ENST00000474461|Transcript|non_coding_transcript_exon_variant&non_coding_transcript_variant|44|||||rs4372192|1||1|SAMD11|HGNC|28706|retained_intron||||||||||1/4|||ENST00000474461.1:n.44N>G||A:0.0822|G:0.86|G:0.92|G:0.92|G:0.95|G:0.883310|G:0.937676|||||||||||||||||,G|ENSG00000187634|ENST00000455979|Transcript|intron_variant||||||rs4372192|1||1|SAMD11|HGNC|28706|protein_coding||||ENSP00000412228|||UPI000155D479||||1/6||ENST00000455979.1:c.187-25N>G||A:0.0822|G:0.86|G:0.92|G:0.92|G:0.95|G:0.883310|G:0.937676||||||||INTRON_SIZE:1683||NON_CAN_SPLICE&ANC_ALLELE|||||||,G|ENSG00000188976|ENST00000483767|Transcript|downstream_gene_variant||||||rs4372192|1|3085|-1|NOC2L|HGNC|24517|retained_intron|||||||||||||||A:0.0822|G:0.86|G:0.92|G:0.92|G:0.95|G:0.883310|G:0.937676|||||||||||||||||,G|ENSG00000187634|ENST00000478729|Transcript|intron_variant&non_coding_transcript_variant||||||rs4372192|1||1|SAMD11|HGNC|28706|processed_transcript|||||||||||1/2||ENST00000478729.1:n.118-25N>G||A:0.0822|G:0.86|G:0.92|G:0.92|G:0.95|G:0.883310|G:0.937676|||||||||||||||||,G|ENSG00000188976|ENST00000477976|Transcript|downstream_gene_variant||||||rs4372192|1|3086|-1|NOC2L|HGNC|24517|retained_intron|||||||||||||||A:0.0822|G:0.86|G:0.92|G:0.92|G:0.95|G:0.883310|G:0.937676|||||||||||||||||,G|ENSG00000187634|ENST00000341065|Transcript|intron_variant||||||rs4372192|1||1|SAMD11|HGNC|28706|protein_coding||||ENSP00000349216|||UPI000155D47A||||5/11||ENST00000341065.4:c.430-25N>G||A:0.0822|G:0.86|G:0.92|G:0.92|G:0.95|G:0.883310|G:0.937676||||||||INTRON_SIZE:1731||NON_CAN_SPLICE&ANC_ALLELE|||||||,G||ENSR00000528857|RegulatoryFeature|regulatory_region_variant||||||rs4372192|1||||||regulatory_region|||||||||||||||A:0.0822|G:0.86|G:0.92|G:0.92|G:0.95|G:0.883310|G:0.937676||||||||||||||||| GT:AD:DP:GQ:PL 1/1:0,7:7:21:253,21,0 1/1:0,8:8:24:296,24,0 1/1:0,4:4:12:148,12,0 1/1:0,5:5:15:184,15,0 1/1:0,1:1:3:39,3,0 1/1:0,5:5:15:176,15,0 
1/1:0,9:9:27:316,27,0 1/1:0,11:11:33:397,33,0 1/1:0,7:7:18:212,18,0 1/1:0,11:11:27:369,27,0 1/1:0,13:13:39:492,39,0 0/1:7,9:15:99:235,0,158 1/1:0,7:7:18:231,18,0 1/1:0,4:4:12:154,12,0 1/1:0,8:8:24:296,24,0 1/1:0,7:7:21:270,21,0 -1 878314 rs142558220 G C 2808.55 PASS AC=3;AF=9.40000e-02;AN=32;BaseQRankSum=1.88800e+00;DB;DP=117;Dels=0.00000e+00;FS=1.06220e+01;HaplotypeScore=2.21900e-01;InbreedingCoeff=3.64000e-02;MQ=5.94000e+01;MQ0=1;MQRankSum=1.65200e+00;QD=9.46000e+00;ReadPosRankSum=-3.88300e+00;SNPEFF_AMINO_ACID_CHANGE=G480;SNPEFF_CODON_CHANGE=ggG/ggC;SNPEFF_EFFECT=SYNONYMOUS_CODING;SNPEFF_EXON_ID=exon_1_877939_878438;SNPEFF_FUNCTIONAL_CLASS=SILENT;SNPEFF_GENE_BIOTYPE=protein_coding;SNPEFF_GENE_NAME=SAMD11;SNPEFF_IMPACT=LOW;SNPEFF_TRANSCRIPT_ID=ENST00000342066;VQSLOD=6.85000e+00;culprit=QD;CSQ=C|ENSG00000187634|ENST00000420190|Transcript|downstream_gene_variant||||||rs142558220&COSM426784|1|3643|1|SAMD11|HGNC|28706|protein_coding||||ENSP00000411579||Q5SV95_HUMAN&I7FV93_HUMAN&A6PWC8_HUMAN|UPI000155D47C||||||||C:0.0482|C:0.01|C:0.06||C:0.10|C:0.013449|C:0.066877||0&1||||||||EXON_INTRON_UNDEF|||||||,C|ENSG00000188976|ENST00000496938|Transcript|downstream_gene_variant||||||rs142558220&COSM426784|1|2385|-1|NOC2L|HGNC|24517|processed_transcript|||||||||||||||C:0.0482|C:0.01|C:0.06||C:0.10|C:0.013449|C:0.066877||0&1|||||||||||||||,C|ENSG00000187634|ENST00000342066|Transcript|synonymous_variant|1523|1440|480|G|ggG/ggC|rs142558220&COSM426784|1||1|SAMD11|HGNC|28706|protein_coding|YES||CCDS2.2|ENSP00000342313|SAM11_HUMAN|Q5SV95_HUMAN&I7FV93_HUMAN&A6PWC8_HUMAN|UPI0000D61E04|||11/14|||ENST00000342066.3:c.1440N>C|ENST00000342066.3:c.1440N>C(p.%3D)|C:0.0482|C:0.01|C:0.06||C:0.10|C:0.013449|C:0.066877||0&1||||||POSITION:0.703812316715543||NON_CAN_SPLICE_SURR|||||||,C|ENSG00000188976|ENST00000327044|Transcript|downstream_gene_variant||||||rs142558220&COSM426784|1|1270|-1|NOC2L|HGNC|24517|protein_coding|YES||CCDS3.1|ENSP00000317992|NOC2L_HUMAN||UPI000041820C||||||||C:0.0482|C:0.01|C
:0.06||C:0.10|C:0.013449|C:0.066877||0&1||||||||EXON_INTRON_UNDEF|||||||,C|ENSG00000187634|ENST00000464948|Transcript|downstream_gene_variant||||||rs142558220&COSM426784|1|42|1|SAMD11|HGNC|28706|retained_intron|||||||||||||||C:0.0482|C:0.01|C:0.06||C:0.10|C:0.013449|C:0.066877||0&1|||||||||||||||,C|ENSG00000187634|ENST00000466827|Transcript|downstream_gene_variant||||||rs142558220&COSM426784|1|132|1|SAMD11|HGNC|28706|retained_intron|||||||||||||||C:0.0482|C:0.01|C:0.06||C:0.10|C:0.013449|C:0.066877||0&1|||||||||||||||,C|ENSG00000187634|ENST00000474461|Transcript|non_coding_transcript_exon_variant&non_coding_transcript_variant|802|||||rs142558220&COSM426784|1||1|SAMD11|HGNC|28706|retained_intron||||||||||4/4|||ENST00000474461.1:n.802N>C||C:0.0482|C:0.01|C:0.06||C:0.10|C:0.013449|C:0.066877||0&1|||||||||||||||,C|ENSG00000187634|ENST00000455979|Transcript|synonymous_variant|920|921|307|G|ggG/ggC|rs142558220&COSM426784|1||1|SAMD11|HGNC|28706|protein_coding||||ENSP00000412228|||UPI000155D479|||5/7|||ENST00000455979.1:c.920N>C|ENST00000455979.1:c.920N>C(p.%3D)|C:0.0482|C:0.01|C:0.06||C:0.10|C:0.013449|C:0.066877||0&1||||||POSITION:0.566769230769231||NON_CAN_SPLICE_SURR|||||||,C|ENSG00000188976|ENST00000483767|Transcript|downstream_gene_variant||||||rs142558220&COSM426784|1|1270|-1|NOC2L|HGNC|24517|retained_intron|||||||||||||||C:0.0482|C:0.01|C:0.06||C:0.10|C:0.013449|C:0.066877||0&1|||||||||||||||,C|ENSG00000187634|ENST00000478729|Transcript|downstream_gene_variant||||||rs142558220&COSM426784|1|761|1|SAMD11|HGNC|28706|processed_transcript|||||||||||||||C:0.0482|C:0.01|C:0.06||C:0.10|C:0.013449|C:0.066877||0&1|||||||||||||||,C|ENSG00000188976|ENST00000477976|Transcript|downstream_gene_variant||||||rs142558220&COSM426784|1|1271|-1|NOC2L|HGNC|24517|retained_intron|||||||||||||||C:0.0482|C:0.01|C:0.06||C:0.10|C:0.013449|C:0.066877||0&1|||||||||||||||,C|ENSG00000187634|ENST00000341065|Transcript|synonymous_variant|1163|1164|388|G|ggG/ggC|rs142558220&COSM426784|1||1|SAMD11|HGN
C|28706|protein_coding||||ENSP00000349216|||UPI000155D47A|||9/12|||ENST00000341065.4:c.1163N>C|ENST00000341065.4:c.1163N>C(p.%3D)|C:0.0482|C:0.01|C:0.06||C:0.10|C:0.013449|C:0.066877||0&1||||||POSITION:0.65799886941775||NON_CAN_SPLICE_SURR|||||||,C||ENSR00000528857|RegulatoryFeature|regulatory_region_variant||||||rs142558220&COSM426784|1||||||regulatory_region|||||||||||||||C:0.0482|C:0.01|C:0.06||C:0.10|C:0.013449|C:0.066877||0&1||||||||||||||| GT:AD:DP:GQ:PL 0/1:2,1:3:30:30,0,67 0/0:2,0:2:6:0,6,82 0/1:2,3:5:61:93,0,61 0/0:2,0:2:6:0,6,72 0/0:1,0:1:3:0,3,37 0/1:2,3:5:54:93,0,54 0/0:16,0:16:45:0,45,560 0/0:5,0:5:15:0,15,188 0/0:12,0:12:33:0,33,414 0/0:10,0:10:30:0,30,393 0/0:16,0:16:48:0,48,617 0/0:5,0:5:15:0,15,181 0/0:5,0:5:15:0,15,193 0/0:13,0:13:36:0,36,439 0/0:9,0:9:24:0,24,317 0/0:8,0:8:21:0,21,276 +1 878314 rs142558220 G C 2808.55 VQSRTrancheSNP99.00to99.90 AC=3;AF=9.40000e-02;AN=32;BaseQRankSum=1.88800e+00;DB;DP=117;Dels=0.00000e+00;FS=1.06220e+01;HaplotypeScore=2.21900e-01;InbreedingCoeff=3.64000e-02;MQ=5.94000e+01;MQ0=1;MQRankSum=1.65200e+00;QD=9.46000e+00;ReadPosRankSum=-3.88300e+00;SNPEFF_AMINO_ACID_CHANGE=G480;SNPEFF_CODON_CHANGE=ggG/ggC;SNPEFF_EFFECT=SYNONYMOUS_CODING;SNPEFF_EXON_ID=exon_1_877939_878438;SNPEFF_FUNCTIONAL_CLASS=SILENT;SNPEFF_GENE_BIOTYPE=protein_coding;SNPEFF_GENE_NAME=SAMD11;SNPEFF_IMPACT=LOW;SNPEFF_TRANSCRIPT_ID=ENST00000342066;VQSLOD=6.85000e+00;culprit=QD;CSQ=C|ENSG00000187634|ENST00000420190|Transcript|downstream_gene_variant||||||rs142558220&COSM426784|1|3643|1|SAMD11|HGNC|28706|protein_coding||||ENSP00000411579||Q5SV95_HUMAN&I7FV93_HUMAN&A6PWC8_HUMAN|UPI000155D47C||||||||C:0.0482|C:0.01|C:0.06||C:0.10|C:0.013449|C:0.066877||0&1||||||||EXON_INTRON_UNDEF|||||||,C|ENSG00000188976|ENST00000496938|Transcript|downstream_gene_variant||||||rs142558220&COSM426784|1|2385|-1|NOC2L|HGNC|24517|processed_transcript|||||||||||||||C:0.0482|C:0.01|C:0.06||C:0.10|C:0.013449|C:0.066877||0&1|||||||||||||||,C|ENSG00000187634|ENST00000342066|Transcript
|synonymous_variant|1523|1440|480|G|ggG/ggC|rs142558220&COSM426784|1||1|SAMD11|HGNC|28706|protein_coding|YES||CCDS2.2|ENSP00000342313|SAM11_HUMAN|Q5SV95_HUMAN&I7FV93_HUMAN&A6PWC8_HUMAN|UPI0000D61E04|||11/14|||ENST00000342066.3:c.1440N>C|ENST00000342066.3:c.1440N>C(p.%3D)|C:0.0482|C:0.01|C:0.06||C:0.10|C:0.013449|C:0.066877||0&1||||||POSITION:0.703812316715543||NON_CAN_SPLICE_SURR|||||||,C|ENSG00000188976|ENST00000327044|Transcript|downstream_gene_variant||||||rs142558220&COSM426784|1|1270|-1|NOC2L|HGNC|24517|protein_coding|YES||CCDS3.1|ENSP00000317992|NOC2L_HUMAN||UPI000041820C||||||||C:0.0482|C:0.01|C:0.06||C:0.10|C:0.013449|C:0.066877||0&1||||||||EXON_INTRON_UNDEF|||||||,C|ENSG00000187634|ENST00000464948|Transcript|downstream_gene_variant||||||rs142558220&COSM426784|1|42|1|SAMD11|HGNC|28706|retained_intron|||||||||||||||C:0.0482|C:0.01|C:0.06||C:0.10|C:0.013449|C:0.066877||0&1|||||||||||||||,C|ENSG00000187634|ENST00000466827|Transcript|downstream_gene_variant||||||rs142558220&COSM426784|1|132|1|SAMD11|HGNC|28706|retained_intron|||||||||||||||C:0.0482|C:0.01|C:0.06||C:0.10|C:0.013449|C:0.066877||0&1|||||||||||||||,C|ENSG00000187634|ENST00000474461|Transcript|non_coding_transcript_exon_variant&non_coding_transcript_variant|802|||||rs142558220&COSM426784|1||1|SAMD11|HGNC|28706|retained_intron||||||||||4/4|||ENST00000474461.1:n.802N>C||C:0.0482|C:0.01|C:0.06||C:0.10|C:0.013449|C:0.066877||0&1|||||||||||||||,C|ENSG00000187634|ENST00000455979|Transcript|synonymous_variant|920|921|307|G|ggG/ggC|rs142558220&COSM426784|1||1|SAMD11|HGNC|28706|protein_coding||||ENSP00000412228|||UPI000155D479|||5/7|||ENST00000455979.1:c.920N>C|ENST00000455979.1:c.920N>C(p.%3D)|C:0.0482|C:0.01|C:0.06||C:0.10|C:0.013449|C:0.066877||0&1||||||POSITION:0.566769230769231||NON_CAN_SPLICE_SURR|||||||,C|ENSG00000188976|ENST00000483767|Transcript|downstream_gene_variant||||||rs142558220&COSM426784|1|1270|-1|NOC2L|HGNC|24517|retained_intron|||||||||||||||C:0.0482|C:0.01|C:0.06||C:0.10|C:0.013449|C:0.06
6877||0&1|||||||||||||||,C|ENSG00000187634|ENST00000478729|Transcript|downstream_gene_variant||||||rs142558220&COSM426784|1|761|1|SAMD11|HGNC|28706|processed_transcript|||||||||||||||C:0.0482|C:0.01|C:0.06||C:0.10|C:0.013449|C:0.066877||0&1|||||||||||||||,C|ENSG00000188976|ENST00000477976|Transcript|downstream_gene_variant||||||rs142558220&COSM426784|1|1271|-1|NOC2L|HGNC|24517|retained_intron|||||||||||||||C:0.0482|C:0.01|C:0.06||C:0.10|C:0.013449|C:0.066877||0&1|||||||||||||||,C|ENSG00000187634|ENST00000341065|Transcript|synonymous_variant|1163|1164|388|G|ggG/ggC|rs142558220&COSM426784|1||1|SAMD11|HGNC|28706|protein_coding||||ENSP00000349216|||UPI000155D47A|||9/12|||ENST00000341065.4:c.1163N>C|ENST00000341065.4:c.1163N>C(p.%3D)|C:0.0482|C:0.01|C:0.06||C:0.10|C:0.013449|C:0.066877||0&1||||||POSITION:0.65799886941775||NON_CAN_SPLICE_SURR|||||||,C||ENSR00000528857|RegulatoryFeature|regulatory_region_variant||||||rs142558220&COSM426784|1||||||regulatory_region|||||||||||||||C:0.0482|C:0.01|C:0.06||C:0.10|C:0.013449|C:0.066877||0&1||||||||||||||| GT:AD:DP:GQ:PL 0/1:2,1:3:30:30,0,67 0/0:2,0:2:6:0,6,82 0/1:2,3:5:61:93,0,61 0/0:2,0:2:6:0,6,72 0/0:1,0:1:3:0,3,37 0/1:2,3:5:54:93,0,54 0/0:16,0:16:45:0,45,560 0/0:5,0:5:15:0,15,188 0/0:12,0:12:33:0,33,414 0/0:10,0:10:30:0,30,393 0/0:16,0:16:48:0,48,617 0/0:5,0:5:15:0,15,181 0/0:5,0:5:15:0,15,193 0/0:13,0:13:36:0,36,439 0/0:9,0:9:24:0,24,317 0/0:8,0:8:21:0,21,276 1 878809 rs191952374 C T 1761.84 PASS 
AC=1;AF=3.10000e-02;AN=32;BaseQRankSum=-7.95300e+00;DB;DP=453;Dels=0.00000e+00;FS=1.68300e+00;HaplotypeScore=2.28500e-01;InbreedingCoeff=-3.10000e-03;MQ=5.91000e+01;MQ0=0;MQRankSum=1.29700e+00;QD=1.36600e+01;ReadPosRankSum=-8.52000e-01;SNPEFF_EFFECT=DOWNSTREAM;SNPEFF_FUNCTIONAL_CLASS=NONE;SNPEFF_GENE_BIOTYPE=processed_transcript;SNPEFF_GENE_NAME=NOC2L;SNPEFF_IMPACT=MODIFIER;SNPEFF_TRANSCRIPT_ID=ENST00000327044;VQSLOD=9.25000e+00;culprit=InbreedingCoeff;CSQ=T|ENSG00000187634|ENST00000420190|Transcript|downstream_gene_variant||||||rs191952374|1|4138|1|SAMD11|HGNC|28706|protein_coding||||ENSP00000411579||Q5SV95_HUMAN&I7FV93_HUMAN&A6PWC8_HUMAN|UPI000155D47C||||||||T:0.0046||T:0.01||T:0.01|T:0.000398|T:0.008636||||||||||EXON_INTRON_UNDEF|||||||,T|ENSG00000188976|ENST00000496938|Transcript|downstream_gene_variant||||||rs191952374|1|1890|-1|NOC2L|HGNC|24517|processed_transcript|||||||||||||||T:0.0046||T:0.01||T:0.01|T:0.000398|T:0.008636|||||||||||||||||,T|ENSG00000187634|ENST00000342066|Transcript|intron_variant||||||rs191952374|1||1|SAMD11|HGNC|28706|protein_coding|YES||CCDS2.2|ENSP00000342313|SAM11_HUMAN|Q5SV95_HUMAN&I7FV93_HUMAN&A6PWC8_HUMAN|UPI0000D61E04||||12/13||ENST00000342066.3:c.1689+52N>T||T:0.0046||T:0.01||T:0.01|T:0.000398|T:0.008636||||||||INTRON_SIZE:320||NON_CAN_SPLICE|||||||,T|ENSG00000188976|ENST00000327044|Transcript|downstream_gene_variant||||||rs191952374|1|775|-1|NOC2L|HGNC|24517|protein_coding|YES||CCDS3.1|ENSP00000317992|NOC2L_HUMAN||UPI000041820C||||||||T:0.0046||T:0.01||T:0.01|T:0.000398|T:0.008636||||||||||EXON_INTRON_UNDEF|||||||,T|ENSG00000187634|ENST00000464948|Transcript|downstream_gene_variant||||||rs191952374|1|537|1|SAMD11|HGNC|28706|retained_intron|||||||||||||||T:0.0046||T:0.01||T:0.01|T:0.000398|T:0.008636|||||||||||||||||,T|ENSG00000187634|ENST00000466827|Transcript|downstream_gene_variant||||||rs191952374|1|627|1|SAMD11|HGNC|28706|retained_intron|||||||||||||||T:0.0046||T:0.01||T:0.01|T:0.000398|T:0.008636|||||||||||||||||,T|ENSG00000
187634|ENST00000474461|Transcript|downstream_gene_variant||||||rs191952374|1|435|1|SAMD11|HGNC|28706|retained_intron|||||||||||||||T:0.0046||T:0.01||T:0.01|T:0.000398|T:0.008636|||||||||||||||||,T|ENSG00000187634|ENST00000455979|Transcript|intron_variant||||||rs191952374|1||1|SAMD11|HGNC|28706|protein_coding||||ENSP00000412228|||UPI000155D479||||6/6||ENST00000455979.1:c.1169+52N>T||T:0.0046||T:0.01||T:0.01|T:0.000398|T:0.008636||||||||INTRON_SIZE:320||NON_CAN_SPLICE|||||||,T|ENSG00000188976|ENST00000483767|Transcript|downstream_gene_variant||||||rs191952374|1|775|-1|NOC2L|HGNC|24517|retained_intron|||||||||||||||T:0.0046||T:0.01||T:0.01|T:0.000398|T:0.008636|||||||||||||||||,T|ENSG00000187634|ENST00000478729|Transcript|downstream_gene_variant||||||rs191952374|1|1256|1|SAMD11|HGNC|28706|processed_transcript|||||||||||||||T:0.0046||T:0.01||T:0.01|T:0.000398|T:0.008636|||||||||||||||||,T|ENSG00000188976|ENST00000477976|Transcript|downstream_gene_variant||||||rs191952374|1|776|-1|NOC2L|HGNC|24517|retained_intron|||||||||||||||T:0.0046||T:0.01||T:0.01|T:0.000398|T:0.008636|||||||||||||||||,T|ENSG00000187634|ENST00000341065|Transcript|intron_variant||||||rs191952374|1||1|SAMD11|HGNC|28706|protein_coding||||ENSP00000349216|||UPI000155D47A||||10/11||ENST00000341065.4:c.1412+52N>T||T:0.0046||T:0.01||T:0.01|T:0.000398|T:0.008636||||||||INTRON_SIZE:320||NON_CAN_SPLICE|||||||,T||ENSR00000528858|RegulatoryFeature|regulatory_region_variant||||||rs191952374|1||||||regulatory_region|||||||||||||||T:0.0046||T:0.01||T:0.01|T:0.000398|T:0.008636||||||||||||||||| GT:AD:DP:GQ:PL 0/0:35,0:35:99:0,105,1323 0/0:24,0:24:72:0,72,960 0/0:31,0:31:93:0,93,1172 0/0:10,0:10:30:0,30,376 0/0:10,0:10:30:0,30,370 0/0:12,0:12:36:0,36,453 0/0:35,0:35:99:0,105,1376 0/0:33,0:33:99:0,99,1273 0/0:31,0:31:93:0,93,1240 0/0:33,0:33:96:0,96,1192 0/0:33,0:33:99:0,99,1281 0/0:25,0:25:75:0,75,983 0/1:18,16:33:99:488,0,504 0/0:33,0:33:99:0,99,1273 0/0:44,0:44:99:0,132,1698 0/0:31,0:31:90:0,90,1198 1 879576 
rs115979567 C T 18648.64 PASS AC=1;AF=3.10000e-02;AN=32;BaseQRankSum=-2.26830e+01;DB;DP=839;Dels=0.00000e+00;FS=2.49350e+01;HaplotypeScore=4.12600e-01;InbreedingCoeff=-2.18000e-02;MQ=5.92100e+01;MQ0=1;MQRankSum=1.02100e+00;QD=1.36100e+01;ReadPosRankSum=7.91400e+00;SNPEFF_EFFECT=DOWNSTREAM;SNPEFF_FUNCTIONAL_CLASS=NONE;SNPEFF_GENE_BIOTYPE=processed_transcript;SNPEFF_GENE_NAME=NOC2L;SNPEFF_IMPACT=MODIFIER;SNPEFF_TRANSCRIPT_ID=ENST00000327044;VQSLOD=4.39000e+00;culprit=InbreedingCoeff;CSQ=T|ENSG00000187634|ENST00000420190|Transcript|downstream_gene_variant||||||rs115979567|1|4905|1|SAMD11|HGNC|28706|protein_coding||||ENSP00000411579||Q5SV95_HUMAN&I7FV93_HUMAN&A6PWC8_HUMAN|UPI000155D47C||||||||T:0.0266|T:0.11|T:0.01||T:0.0013|T:0.061404|T:0.000120||||||||||EXON_INTRON_UNDEF|||||||,T|ENSG00000188976|ENST00000496938|Transcript|downstream_gene_variant||||||rs115979567|1|1123|-1|NOC2L|HGNC|24517|processed_transcript|||||||||||||||T:0.0266|T:0.11|T:0.01||T:0.0013|T:0.061404|T:0.000120|||||||||||||||||,T|ENSG00000187634|ENST00000342066|Transcript|3_prime_UTR_variant|2172|||||rs115979567|1||1|SAMD11|HGNC|28706|protein_coding|YES||CCDS2.2|ENSP00000342313|SAM11_HUMAN|Q5SV95_HUMAN&I7FV93_HUMAN&A6PWC8_HUMAN|UPI0000D61E04|||14/14|||ENST00000342066.3:c.*43N>T||T:0.0266|T:0.11|T:0.01||T:0.0013|T:0.061404|T:0.000120||||||||||NON_CAN_SPLICE_SURR|||||||,T|ENSG00000188976|ENST00000327044|Transcript|downstream_gene_variant||||||rs115979567|1|8|-1|NOC2L|HGNC|24517|protein_coding|YES||CCDS3.1|ENSP00000317992|NOC2L_HUMAN||UPI000041820C||||||||T:0.0266|T:0.11|T:0.01||T:0.0013|T:0.061404|T:0.000120||||||||||EXON_INTRON_UNDEF|||||||,T|ENSG00000187634|ENST00000464948|Transcript|downstream_gene_variant||||||rs115979567|1|1304|1|SAMD11|HGNC|28706|retained_intron|||||||||||||||T:0.0266|T:0.11|T:0.01||T:0.0013|T:0.061404|T:0.000120|||||||||||||||||,T|ENSG00000187634|ENST00000466827|Transcript|downstream_gene_variant||||||rs115979567|1|1394|1|SAMD11|HGNC|28706|retained_intron|||||||||||||||T:0.0266|T:
0.11|T:0.01||T:0.0013|T:0.061404|T:0.000120|||||||||||||||||,T|ENSG00000187634|ENST00000474461|Transcript|downstream_gene_variant||||||rs115979567|1|1202|1|SAMD11|HGNC|28706|retained_intron|||||||||||||||T:0.0266|T:0.11|T:0.01||T:0.0013|T:0.061404|T:0.000120|||||||||||||||||,T|ENSG00000187634|ENST00000455979|Transcript|3_prime_UTR_variant|1668|||||rs115979567|1||1|SAMD11|HGNC|28706|protein_coding||||ENSP00000412228|||UPI000155D479|||7/7|||ENST00000455979.1:c.*43N>T||T:0.0266|T:0.11|T:0.01||T:0.0013|T:0.061404|T:0.000120||||||||||NON_CAN_SPLICE_SURR|||||||,T|ENSG00000188976|ENST00000483767|Transcript|downstream_gene_variant||||||rs115979567|1|8|-1|NOC2L|HGNC|24517|retained_intron|||||||||||||||T:0.0266|T:0.11|T:0.01||T:0.0013|T:0.061404|T:0.000120|||||||||||||||||,T|ENSG00000187634|ENST00000478729|Transcript|downstream_gene_variant||||||rs115979567|1|2023|1|SAMD11|HGNC|28706|processed_transcript|||||||||||||||T:0.0266|T:0.11|T:0.01||T:0.0013|T:0.061404|T:0.000120|||||||||||||||||,T|ENSG00000188976|ENST00000477976|Transcript|downstream_gene_variant||||||rs115979567|1|9|-1|NOC2L|HGNC|24517|retained_intron|||||||||||||||T:0.0266|T:0.11|T:0.01||T:0.0013|T:0.061404|T:0.000120|||||||||||||||||,T|ENSG00000187634|ENST00000341065|Transcript|3_prime_UTR_variant|1812|||||rs115979567|1||1|SAMD11|HGNC|28706|protein_coding||||ENSP00000349216|||UPI000155D47A|||12/12|||ENST00000341065.4:c.*43N>T||T:0.0266|T:0.11|T:0.01||T:0.0013|T:0.061404|T:0.000120||||||||||NON_CAN_SPLICE_SURR||||||| GT:AD:DP:GQ:PL 0/0:66,0:66:99:0,199,2441 0/1:25,28:51:99:767,0,834 0/0:55,0:55:99:0,166,2079 0/0:11,0:11:33:0,33,392 0/0:15,0:15:45:0,45,554 0/0:17,0:17:51:0,51,622 0/0:50,0:50:99:0,150,1966 0/0:65,0:65:99:0,196,2507 0/0:60,0:60:99:0,181,2268 0/0:70,0:70:99:0,211,2589 0/0:64,0:64:99:0,193,2452 0/0:65,0:65:99:0,196,2507 0/0:56,0:56:99:0,169,2117 0/0:60,0:60:99:0,181,2359 0/0:79,0:79:99:0,238,2986 0/0:55,0:55:99:0,165,2099 1 881070 rs41285794 G A 20769.13 PASS 
AC=1;AF=3.10000e-02;AN=32;BaseQRankSum=-9.52200e+00;DB;DP=1185;Dels=0.00000e+00;FS=1.15500e+01;HaplotypeScore=1.69500e-01;InbreedingCoeff=-1.83000e-02;MQ=5.90700e+01;MQ0=0;MQRankSum=1.85940e+01;QD=1.31500e+01;ReadPosRankSum=-8.90700e+00;SNPEFF_EFFECT=DOWNSTREAM;SNPEFF_FUNCTIONAL_CLASS=NONE;SNPEFF_GENE_BIOTYPE=protein_coding;SNPEFF_GENE_NAME=SAMD11;SNPEFF_IMPACT=MODIFIER;SNPEFF_TRANSCRIPT_ID=ENST00000342066;VQSLOD=4.22000e+00;culprit=InbreedingCoeff;CSQ=A|ENSG00000188976|ENST00000496938|Transcript|upstream_gene_variant||||||rs41285794|1|128|-1|NOC2L|HGNC|24517|processed_transcript|||||||||||||||A:0.0051|A:0.0020|A:0.01||A:0.01|A:0.002497|A:0.009651|||||||||||||||||,A|ENSG00000187634|ENST00000342066|Transcript|downstream_gene_variant||||||rs41285794|1|1115|1|SAMD11|HGNC|28706|protein_coding|YES||CCDS2.2|ENSP00000342313|SAM11_HUMAN|Q5SV95_HUMAN&I7FV93_HUMAN&A6PWC8_HUMAN|UPI0000D61E04||||||||A:0.0051|A:0.0020|A:0.01||A:0.01|A:0.002497|A:0.009651||||||||||EXON_INTRON_UNDEF|||||||,A|ENSG00000188976|ENST00000327044|Transcript|intron_variant||||||rs41285794|1||-1|NOC2L|HGNC|24517|protein_coding|YES||CCDS3.1|ENSP00000317992|NOC2L_HUMAN||UPI000041820C||||16/18||ENST00000327044.6:c.1918-37N>T||A:0.0051|A:0.0020|A:0.01||A:0.01|A:0.002497|A:0.009651||||||||INTRON_SIZE:519||NON_CAN_SPLICE|||||||,A|ENSG00000187634|ENST00000464948|Transcript|downstream_gene_variant||||||rs41285794|1|2798|1|SAMD11|HGNC|28706|retained_intron|||||||||||||||A:0.0051|A:0.0020|A:0.01||A:0.01|A:0.002497|A:0.009651|||||||||||||||||,A|ENSG00000187634|ENST00000466827|Transcript|downstream_gene_variant||||||rs41285794|1|2888|1|SAMD11|HGNC|28706|retained_intron|||||||||||||||A:0.0051|A:0.0020|A:0.01||A:0.01|A:0.002497|A:0.009651|||||||||||||||||,A|ENSG00000187634|ENST00000474461|Transcript|downstream_gene_variant||||||rs41285794|1|2696|1|SAMD11|HGNC|28706|retained_intron|||||||||||||||A:0.0051|A:0.0020|A:0.01||A:0.01|A:0.002497|A:0.009651|||||||||||||||||,A|ENSG00000187634|ENST00000455979|Transcript|downstream
_gene_variant||||||rs41285794|1|1431|1|SAMD11|HGNC|28706|protein_coding||||ENSP00000412228|||UPI000155D479||||||||A:0.0051|A:0.0020|A:0.01||A:0.01|A:0.002497|A:0.009651||||||||||EXON_INTRON_UNDEF|||||||,A|ENSG00000188976|ENST00000483767|Transcript|intron_variant&non_coding_transcript_variant||||||rs41285794|1||-1|NOC2L|HGNC|24517|retained_intron|||||||||||2/4||ENST00000483767.1:n.774-37N>T||A:0.0051|A:0.0020|A:0.01||A:0.01|A:0.002497|A:0.009651|||||||||||||||||,A|ENSG00000187634|ENST00000478729|Transcript|downstream_gene_variant||||||rs41285794|1|3517|1|SAMD11|HGNC|28706|processed_transcript|||||||||||||||A:0.0051|A:0.0020|A:0.01||A:0.01|A:0.002497|A:0.009651|||||||||||||||||,A|ENSG00000188976|ENST00000477976|Transcript|intron_variant&non_coding_transcript_variant||||||rs41285794|1||-1|NOC2L|HGNC|24517|retained_intron|||||||||||14/16||ENST00000477976.1:n.3365-37N>T||A:0.0051|A:0.0020|A:0.01||A:0.01|A:0.002497|A:0.009651|||||||||||||||||,A|ENSG00000187634|ENST00000341065|Transcript|downstream_gene_variant||||||rs41285794|1|1115|1|SAMD11|HGNC|28706|protein_coding||||ENSP00000349216|||UPI000155D47A||||||||A:0.0051|A:0.0020|A:0.01||A:0.01|A:0.002497|A:0.009651||||||||||EXON_INTRON_UNDEF||||||| GT:AD:DP:GQ:PL 0/0:78,0:78:99:0,235,2947 0/0:66,0:66:99:0,199,2595 0/0:59,0:59:99:0,178,2182 0/0:37,0:37:99:0,111,1365 0/0:36,0:36:99:0,108,1331 0/0:33,0:33:99:0,99,1192 0/0:69,0:69:99:0,208,2662 0/0:88,0:88:99:0,265,3395 0/0:98,0:98:99:0,295,3781 0/1:36,31:64:99:982,0,1021 0/0:86,0:86:99:0,259,3318 0/0:93,0:93:99:0,280,3588 0/0:92,0:92:99:0,277,3549 0/0:102,0:102:99:0,307,3935 0/0:89,0:89:99:0,268,3433 0/0:95,0:95:99:0,286,3665 From 51818484f0eb429e518746fee886927b20027933 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 24 Apr 2025 11:17:07 -0400 Subject: [PATCH 32/55] add filters --- v03_pipeline/lib/tasks/delete_project_family_tables_test.py | 2 +- v03_pipeline/lib/tasks/update_project_table_test.py | 2 +- 
.../tasks/update_project_table_with_deleted_families_test.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/v03_pipeline/lib/tasks/delete_project_family_tables_test.py b/v03_pipeline/lib/tasks/delete_project_family_tables_test.py index 585aef160..b99a6a052 100644 --- a/v03_pipeline/lib/tasks/delete_project_family_tables_test.py +++ b/v03_pipeline/lib/tasks/delete_project_family_tables_test.py @@ -62,7 +62,7 @@ def setUp(self) -> None: reference_genome='GRCh38', ), 'alleles': ['G', 'C'], - 'filters': set(), + 'filters': set(['VQSRTrancheSNP99.00to99.90']), 'family_entries': [ [ hl.Struct( diff --git a/v03_pipeline/lib/tasks/update_project_table_test.py b/v03_pipeline/lib/tasks/update_project_table_test.py index 24f28ed9f..9108f71ae 100644 --- a/v03_pipeline/lib/tasks/update_project_table_test.py +++ b/v03_pipeline/lib/tasks/update_project_table_test.py @@ -97,7 +97,7 @@ def test_update_project_table_task(self) -> None: reference_genome='GRCh38', ), alleles=['G', 'C'], - filters=set(), + filters=set(['VQSRTrancheSNP99.00to99.90']), family_entries=[ [ hl.Struct( diff --git a/v03_pipeline/lib/tasks/update_project_table_with_deleted_families_test.py b/v03_pipeline/lib/tasks/update_project_table_with_deleted_families_test.py index 72c475d3f..604aa1044 100644 --- a/v03_pipeline/lib/tasks/update_project_table_with_deleted_families_test.py +++ b/v03_pipeline/lib/tasks/update_project_table_with_deleted_families_test.py @@ -60,7 +60,7 @@ def setUp(self) -> None: reference_genome='GRCh38', ), 'alleles': ['G', 'C'], - 'filters': set(), + 'filters': set(['VQSRTrancheSNP99.00to99.90']), 'family_entries': [ [ hl.Struct( From 84e847217ce828f20c7195d647ef920c5dd97bde Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 24 Apr 2025 11:31:57 -0400 Subject: [PATCH 33/55] ruff --- v03_pipeline/lib/tasks/delete_project_family_tables_test.py | 2 +- v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py | 5 +---- 
.../lib/tasks/exports/write_new_entries_parquet_test.py | 4 ++-- v03_pipeline/lib/tasks/update_project_table_test.py | 2 +- .../tasks/update_project_table_with_deleted_families_test.py | 2 +- 5 files changed, 6 insertions(+), 9 deletions(-) diff --git a/v03_pipeline/lib/tasks/delete_project_family_tables_test.py b/v03_pipeline/lib/tasks/delete_project_family_tables_test.py index b99a6a052..4616814a4 100644 --- a/v03_pipeline/lib/tasks/delete_project_family_tables_test.py +++ b/v03_pipeline/lib/tasks/delete_project_family_tables_test.py @@ -62,7 +62,7 @@ def setUp(self) -> None: reference_genome='GRCh38', ), 'alleles': ['G', 'C'], - 'filters': set(['VQSRTrancheSNP99.00to99.90']), + 'filters': {'VQSRTrancheSNP99.00to99.90'}, 'family_entries': [ [ hl.Struct( diff --git a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py index 9bc3d7a99..6ac897125 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py +++ b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py @@ -135,8 +135,5 @@ def create_table(self) -> None: dp=ht.family_entries.DP, ), ) - if not unioned_ht: - unioned_ht = ht - else: - unioned_ht = unioned_ht.union(ht) + unioned_ht = unioned_ht.union(ht) if unioned_ht else ht return unioned_ht diff --git a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py index f36892a26..b37664242 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py +++ b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py @@ -4,6 +4,7 @@ import luigi.worker import pandas as pd +from v03_pipeline.lib.misc.io import import_vcf, remap_pedigree_hash from v03_pipeline.lib.model import ( DatasetType, ReferenceGenome, @@ -20,7 +21,6 @@ from v03_pipeline.lib.test.mocked_reference_datasets_testcase import ( MockedReferenceDatasetsTestCase, ) -from v03_pipeline.lib.misc.io import import_vcf, 
remap_pedigree_hash TEST_PEDIGREE_3_REMAP = 'v03_pipeline/var/test/pedigrees/test_pedigree_3_remap.tsv' TEST_PEDIGREE_4_REMAP = 'v03_pipeline/var/test/pedigrees/test_pedigree_4_remap.tsv' @@ -47,7 +47,7 @@ def setUp(self) -> None: project_guid='R0114_project4', remap_pedigree_hash=remap_pedigree_hash(TEST_PEDIGREE_4_REMAP), ), - } + }, ) ht.write( variant_annotations_table_path( diff --git a/v03_pipeline/lib/tasks/update_project_table_test.py b/v03_pipeline/lib/tasks/update_project_table_test.py index 9108f71ae..a45d2badf 100644 --- a/v03_pipeline/lib/tasks/update_project_table_test.py +++ b/v03_pipeline/lib/tasks/update_project_table_test.py @@ -97,7 +97,7 @@ def test_update_project_table_task(self) -> None: reference_genome='GRCh38', ), alleles=['G', 'C'], - filters=set(['VQSRTrancheSNP99.00to99.90']), + filters={'VQSRTrancheSNP99.00to99.90'}, family_entries=[ [ hl.Struct( diff --git a/v03_pipeline/lib/tasks/update_project_table_with_deleted_families_test.py b/v03_pipeline/lib/tasks/update_project_table_with_deleted_families_test.py index 604aa1044..b24e18938 100644 --- a/v03_pipeline/lib/tasks/update_project_table_with_deleted_families_test.py +++ b/v03_pipeline/lib/tasks/update_project_table_with_deleted_families_test.py @@ -60,7 +60,7 @@ def setUp(self) -> None: reference_genome='GRCh38', ), 'alleles': ['G', 'C'], - 'filters': set(['VQSRTrancheSNP99.00to99.90']), + 'filters': {'VQSRTrancheSNP99.00to99.90'}, 'family_entries': [ [ hl.Struct( From 7985be89fb52526e00ecafe630f97222c3fc77a6 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 24 Apr 2025 12:54:26 -0400 Subject: [PATCH 34/55] handle hgmd edge case --- v03_pipeline/lib/model/dataset_type.py | 4 ++-- v03_pipeline/lib/tasks/exports/misc.py | 12 ++++++++++++ v03_pipeline/lib/tasks/exports/misc_test.py | 6 +++--- .../write_new_variants_parquet_test.py | 4 ++-- .../SNV_INDEL/annotations.ht/.README.txt.crc | Bin 12 -> 12 bytes .../SNV_INDEL/annotations.ht/README.txt | 2 +- .../.index.crc | Bin 
12 -> 0 bytes .../index | Bin 87 -> 0 bytes .../.index.crc | Bin 0 -> 12 bytes .../.metadata.json.gz.crc | Bin .../index | Bin 0 -> 87 bytes .../metadata.json.gz | Bin .../annotations.ht/rows/.metadata.json.gz.crc | Bin 24 -> 24 bytes .../annotations.ht/rows/metadata.json.gz | Bin 1820 -> 1817 bytes ...0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1.crc | Bin 16 -> 0 bytes ...0-d277dff1-643a-41df-b9e6-a4dbf5cf99cc.crc | Bin 0 -> 16 bytes ...art-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1 | Bin 929 -> 0 bytes ...art-0-d277dff1-643a-41df-b9e6-a4dbf5cf99cc | Bin 0 -> 939 bytes 18 files changed, 20 insertions(+), 8 deletions(-) delete mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1.idx/.index.crc delete mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1.idx/index create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-d277dff1-643a-41df-b9e6-a4dbf5cf99cc.idx/.index.crc rename v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/{part-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1.idx => part-0-d277dff1-643a-41df-b9e6-a4dbf5cf99cc.idx}/.metadata.json.gz.crc (100%) create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-d277dff1-643a-41df-b9e6-a4dbf5cf99cc.idx/index rename v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/{part-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1.idx => part-0-d277dff1-643a-41df-b9e6-a4dbf5cf99cc.idx}/metadata.json.gz (100%) delete mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/.part-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1.crc create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/.part-0-d277dff1-643a-41df-b9e6-a4dbf5cf99cc.crc delete mode 100644 
v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/part-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1 create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/part-0-d277dff1-643a-41df-b9e6-a4dbf5cf99cc diff --git a/v03_pipeline/lib/model/dataset_type.py b/v03_pipeline/lib/model/dataset_type.py index 6ae6aed55..0b421a138 100644 --- a/v03_pipeline/lib/model/dataset_type.py +++ b/v03_pipeline/lib/model/dataset_type.py @@ -416,8 +416,8 @@ def export_parquet_filterable_transcripts_fields( if self == DatasetType.SNV_INDEL and reference_genome == ReferenceGenome.GRCh38: fields = { **fields, - 'alphamissense': 'alphamissense.pathogenicity', - 'extended_intronic_splice_region_variant': 'spliceregion.extended_intronic_splice_region_variant', + 'alphamissensePathogenicity': 'alphamissense.pathogenicity', + 'extendedIntronicSpliceRegionVariant': 'spliceregion.extended_intronic_splice_region_variant', 'fiveutrConsequence': 'utrannotator.fiveutrConsequence', } # Parquet export expects all fields sorted alphabetically diff --git a/v03_pipeline/lib/tasks/exports/misc.py b/v03_pipeline/lib/tasks/exports/misc.py index 051db90bf..7321b3647 100644 --- a/v03_pipeline/lib/tasks/exports/misc.py +++ b/v03_pipeline/lib/tasks/exports/misc.py @@ -175,6 +175,18 @@ def unmap_reference_dataset_annotation_enums( ), }, ) + + # Explicit hgmd "class" edge case: + if hasattr(ht, ReferenceDataset.hgmd.value) and ReferenceDataset.hgmd in reference_datasets: + ht = ht.annotate( + **{ + ReferenceDataset.hgmd.value: ht[ + ReferenceDataset.hgmd.value + ].annotate( + class_=ht[ReferenceDataset.hgmd.value]['class'] + ).drop('class') + } + ) return ht.annotate_globals(enums=ht.globals.enums.drop(*unmapped_annotation_name)) diff --git a/v03_pipeline/lib/tasks/exports/misc_test.py b/v03_pipeline/lib/tasks/exports/misc_test.py index db2be3c2d..5c98637b6 100644 --- a/v03_pipeline/lib/tasks/exports/misc_test.py +++ b/v03_pipeline/lib/tasks/exports/misc_test.py @@ 
-109,7 +109,7 @@ def test_unmap_formatting_annotation_enums(self) -> None: ), ], gnomad_non_coding_constraint=hl.Struct(z_score=None), - hgmd=None, + hgmd=hl.Struct(accession='abcdefg', class_id=3), gnomad_exomes=hl.Struct( AF=0.0006690866430290043, AN=1440770, @@ -212,7 +212,7 @@ def test_unmap_reference_dataset_annotation_enums(self) -> None: CAID='CA502654', check_ref=False, gnomad_non_coding_constraint=hl.Struct(z_score=None), - hgmd=None, + hgmd=hl.Struct(accession='abcdefg', class_='DFP'), gnomad_exomes=hl.Struct( AF=0.0006690866430290043, AN=1440770, @@ -315,7 +315,7 @@ def test_camelcase_array_structexpression_fields(self) -> None: CAID='CA502654', check_ref=False, gnomad_non_coding_constraint=hl.Struct(z_score=None), - hgmd=None, + hgmd=hl.Struct(accession='abcdefg', class_='DFP'), gnomad_exomes=hl.Struct( AF=0.0006690866430290043, AN=1440770, diff --git a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py index cbc7dcaa8..ffaeda122 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py +++ b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py @@ -151,10 +151,10 @@ def test_write_new_variants_parquet_test( ], 'sortedTranscriptConsequences': [ { - 'alphamissense': None, + 'alphamissensePathogenicity': None, 'canonical': 1.0, 'consequenceTerms': ['missense_variant'], - 'extended_intronic_splice_region_variant': False, + 'extendedIntronicSpliceRegionVariant': False, 'fiveutrConsequence': None, 'geneId': 'ENSG00000187634', }, diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/.README.txt.crc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/.README.txt.crc index 73363121b99513464789a5b6ef363643c72a5c0c..929848c8f70020fdfb19c695a3d4b60115577883 100644 GIT binary patch literal 12 TcmYc;N@ieSU}D%J7R3wz5lI4K literal 12 TcmYc;N@ieSU}9MQe8*A%6y^jb diff --git 
a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/README.txt b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/README.txt index 81b15aae7..9d34ddd04 100644 --- a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/README.txt +++ b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. Written with version 0.2.133-4c60fddb171a - Created at 2025/04/24 00:34:36 \ No newline at end of file + Created at 2025/04/24 12:35:20 \ No newline at end of file diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1.idx/.index.crc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1.idx/.index.crc deleted file mode 100644 index 812242553eab389da26ce5f3d0090e92c85b3cc2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12 TcmYc;N@ieSU}AVWDX0nn6X*ke diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1.idx/index b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-599eadd2-e2d3-43c3-abaa-0c1f30f93bd1.idx/index deleted file mode 100644 index 60c06ecd3401279de49c80fc184866f61b80c372..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 87 zcmdOAU|>)JVvVi(e-)Gj8G%eDmgI~g!;dR0nHU(I8ABMr+)rSxJEP-&E(M?(VVD-- MnG6h!KwW5h01ST;Qvd(} diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-d277dff1-643a-41df-b9e6-a4dbf5cf99cc.idx/.index.crc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-d277dff1-643a-41df-b9e6-a4dbf5cf99cc.idx/.index.crc new file mode 100644 index 0000000000000000000000000000000000000000..3704b42b8edd6ae577ebfa629a5a2f055a90eb14 GIT binary patch literal 12 
TcmYc;N@ieSU}89XB)JVvVi(e-)Gj8G%eDmgI~g!;dR0nHU(I8ABMr+)rSxJELOtNgTU6=h;wG)%9LH&bri~FB*~JzC!AO+N zRuZ)oolP3(-*+fc6iLx`9H1{PilQ2h=E4~bkL2ed#~BHR!DSwkM-t!Mp3Mpp9R^{5 zevqeN_$^3Tv?}2mrzuHE8Qj5A#9|Uj#bEf{T9`i-Bp&Mp&>~UdSCM$G z%?&M4#BxCwQbu#kpTdHbR8W?K>+DeL-@9 z46n<@Lm63Dg}yREG49APSSVp9B4NvlbXB4#rOLfpx~EJ)1!@azL^>@b2+vR*ze=xG zxeLi7?30-*m95cQ5KQO1iT}EZa6KLu&ZTBAVkYzfyI)+=Fy0Xr{Uy0 zWwLXensE09y%%X9h9Gld|0^y~fI4(O3PybNxJo zOP1*;Z9KX>>*msuL_eTm6KPLvwVHU)#j8|y>Jv%`7AsD+PQKR-6_2AXdDsyZPsRjI zUL49x9NQKxpNpJjI7TqE<#{HbENKuPUwGE-t^bM2h;d@fp5-EoZ5i~)<4XALY(=k#;EG3{$!5t^D-%{Sf`w3Sdh`BQU#8=pXeFlL-y^Q4 z^BkA*j4Gsx(Ks*!Ga0mh8hC4GRP2HXZHYYM$f>+4r(wi2I7XxRd)E-CGge<2eIeI$ zwILGY#`c6om|D(bk_znZg;Mn%XpjJf{7fy6nSY5t^|C^Lv?B} z2JNQJ+qE2G&A(MWFq#NquCO$g=Y&3^%hi2G1r*kpVv1iOgt#Wt{t^ri!sU{0tgVTq z3IWqYrO1T{kX35jMm^YeW6yO@q#j);4n17F}jm zw|B^F5wlutfmvNPylh+)?S)O0N@&8eDY98vtr(};hC{7M$K+%SdQIlF0ePF6myA_t zHW+)E*hOM546k8fGbe|HcVu8c0&j-jZS3ns-;LzE7JNs-W z?mD#V!mje*WnB;IDs>NsZU9|}b9H0cn7fg39mrKCT*mbxuF|au*8$w_Fwgl( zO{{H4ho;&cnbw*~uXyhLDBDE!B9gTN*zT`Ku-er|fOXK-Cf6Rbwwu}_tCv_iSTuM; zqhiM$>p|7wL<8@mdo?*dIQ($hErmg}q-kuZ zoB}}pfEX;LdnugJpP+F3{{7K$|F(Zh5!biwc%MACapvAx$v==pjbBajm<6@BXaSr4&w=r2eq|qOaZND?>tgDbK*i7RlwlcAv4W*9%w~{<<&aRR~~J5}^Yz z$jEX5hqv#&q07X`!|G)j|E@0?I;vhwPBgzs|G`6>PR_pZ>chh#0-n~(MsjNxlckEr*v{!yesP~}T z9otG3NaF>(Zk$v3BZ_KpWfcyDkJCwuoF$?+Z@#P`YjDEas? 
zit67Yzu(||K~CYwjOG2$K`Gx}WMa)Pm++sag~4mI?$NHL*eRM9U`9jg(X}9V`Tw-m2<<3`K>X HoE-oF;%Iw* literal 1820 zcmV+%2jlo3iwFP!000000Nt7YPun;Y!2g&0bko8PC}UfHE2RuHHY!?ir<;&v5~uY@ z;+*4hFqHrOo)gEh9k-=O`(@KKEzi&Ii~aokN%|b*I3vL@xX5GjNaE|8(^)~H!ypXM zJ$VX---DDzs}i1Znv#^1!7U6$EGCgu42I9Gh51uK;;|k8Eh0rQ@^*59OEMFD6^ZBC z+|Uw5EEjYkWi-e9DJ)n?1!Xx%E^;wAtU~1*1S8Jz(|4tL8}h~A18Pj}W|5aP9=5|J zu^pg>aF(boZp@fx)ge@!5Rf(dMD)(yWjxqr|P+Mpt(rF<E)2OVBEDuOF{Dm*h)B@8N?V*%66&rC|e1ZY*JX6(u6(` zaA_8-vES{eiikRUC96Ykz!&xt-;d$oK+d)hv`M}YOD#QxNHBPsv+?Py^)<#tC!Yz- zMLS}OSXqHcDpo6MWdkHsAsRYGr2&2QL(ILYC`SRtDckpX7j75cMPWjSu4>B zP^!{y^q_pvdJl5cveF7{D^*q9hN^1J@_&RDRjV8yg$seefpEIyXxzFQebfIt)8`>v zvP_?}@#x~T>q|=#-9y19(wy*^8Gh&yU(G0wMXYANH5!w=Y#F4Y}YC8=hrok~9#Xq`&K%KGr$*2oC zrgs}6F($SbEW*@s9+OmHcP*5v_dbIJDAccIWFjTbwpO8JA?UL95v?Ke!t7fK_>y<( z)Ibc{HJi6|$;zh4MrAc(TxuKcv?d)BlP%^ok=N$qZE9XH zR-xHk>_uXih`lVl28GR>91z|Sf&B!$nSHmRua|r`g6~@HHIY{>twp@1OareH+Q7FK zcfGXhz^==>%7YhmJ*2DDJs7%ya~;gpg=JIjM#yy-SDA1T*Gsrcw+38?Z@bG(gRQ>Q zOsqAjwrw36YIj6ht0%qU-1|wkiRwilYXz{~Uk_lltBw5XkgE-@J!WkewMAAhv39U% z@P6vECi*FL2cK);^aVZ@nXX|{yF)4>T?3=rSk!}}9uoC|=td6pVQ8O1 z{Sdl^L2U%>9azu4blV|e(FD&Pa~jC$5T^y47H`_PY0+j6Ha(oV88a=)^Z;g$FI}|U zN|x5??_r!tlzpD;;iOG$229!mHcDFE#UFv<$c`fKR!Ks*^TGFWa&mC^@uV9HgJ?<9 z*ibnEfV_toETuasoY9}4aQy!L;J82SpHjs2)E(!^dmCr&%u4=&Bx?L}a=2(^^%rkW zQn_CsfA{XsT3t%vc1h~*+AsRpZN4%jl$Y`jd~A_iu50%d+j_l#k>RiVP+o-qRwWTS z5QB^?7jSv|-W$41j6AGbmhtcUlA)vO#pFcuo75j1+8n3fpln`OPqjZ@e?;+AP+hQ+ z?SY={&FBl*=e*LdLsI&bWcX~#0u3t*LAKXEs0H|gCA)tAbeM1IHGg++VihlF|! 
zy4A6*RDm>J!0W~tmA@?xzX$SRZ-4Lor%wdO@!=jhj0by1gJ`ga?=armj}DT-e)4H> z7a!C=MgF+P`GTCll^M(V&p|2QUSwj;50~(tri8(3wB!d6PFupk@L+#`|DfuA+XVk} zP#6WU;{m+&zSPXo;nDHYz_B!*OIzdG7R{q{6_Z&L8zfJ(Y$DxANhQ<4LU8G=s{RL_ Ko8{*FkzCZf+Kf|d(8@muWrcj|DT@_t-kM`uE3T%-D)G_L zRE>W+o9H92Xm93Yt~RzBZNROZ^<1r(Bel3~-FDZTtwwWYyhm>9X{*sncy7QL&kTs7 zG%*{9$^WM`5e7?dbVfhyB$Vv0LC*9vqycoqIcIzmJ`MlrXT5LL=1hc=C__D_A3b6Z zauh65y1`~Jdu_)ZW>*|rT=l)Ht#2PkZg=J5t{h*??RJfrvaTbqc4x~iC#k*OvEos6 zvLenq^Q^Dkc9$qOEIx~y=c_HQrlw+r*wJDUTYNLVR_|S}$tt@a-hr2d2R~iUug9R} z6(P3+>Ug^I<@@z=3v_5;o>Fe(C^XPwS>E&^52Qai>F1CN8ROf%TpmROsGF!VF8?yN z3oyzUGZ0D^9Un>%F*~G%R?jT@myIW0$N&HZ>HmLM;9!Luagp>dso;(-gu=r=SzslA zyg?A+QARjDg26z)|&rMSu_neLRrv^97`+ybmBHR3S`a;@1^1@QY5Ly z8Wmc|4hei-Bo*|*b0(DdA7^ASodc5V1K=nvjT>aN(BE$ujUXTtE}5_sc30Kax+JR8 zw`dEuQq1zyDK^@gahc^=wV4CP&KwX$ZPdh_xQi>%L>VkK>ACc)-_h;tetJLO3)thW zp1xpQbUMI*iKxLXGBhJ2K}rge6w(3kLZPAQ5&+^@24fkLVG#%sF+yULsd!{$f^h@Y zqTFlJbzJceoyQvwG;*F$nJo6mw-gRWMLfrcMs*W^FU z@ihhz8%(sdf5OBD`(#|Ri9(pJGGG=~yHxUbo`|$d|Aapc000000000ewJ-f300961 D1I@IY diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/part-0-d277dff1-643a-41df-b9e6-a4dbf5cf99cc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/part-0-d277dff1-643a-41df-b9e6-a4dbf5cf99cc new file mode 100644 index 0000000000000000000000000000000000000000..d5237ae1e9df036cb6c40d3d1560c5485433e88b GIT binary patch literal 939 zcmV;c162H$0{{TT3IG5owJ-f(#0f`*vOVC zegJj=k^uFds7ktLaogK7Ub&UC%&(Xuxu|!w8LQYKmVXjmi9G6Qixq9&nqyll&Zf39 z0Xl1{#=p}h`p7HVoB5dQjjcu-a4Tm$S1aa7EpA)4-SuXx(OMbrk=tt8YBUp{8xZC* z1D&UC!*>cNE zYOi;!cubkBi1W@o>ua~&WeN_9&*J9!YKya}rB)(#v{=Lz-;A%-dk4E9-T@bc2EX&? 
z*Xwa{J;?2VC+{>}zF#lLvr`lElyV$5LIWF(e zET9rV;vo3&rx!UvhDg{6;z>R|r!L2{b9$=n*mcJNPggVyI>OixjU7iS-elr8I0>W< z49}I~DNyuCjW#B@kQ)*J{Rk>3gzH2Y@jueUWV!|(tq+i6bTnp|(L#UE@gf8PqVP!e zGGA9!TdZ4>GChm3FiS;Nnp(w1Su`zKX;v-E0bynic%n8c!Kj40#Fmj&px;PHvCA=}Ud<4nj@mfHzM86>-Y-AP{bYTe}SbOpst5`s188fLVjC zR*Zp$A7t0$Kg{tp1`r!ew6%Z2#0L9hT(gNnn65Hl7FN4d@^_wyv`hYkKMeo?00000 N04TLD{U87V008%0x4-}Z literal 0 HcmV?d00001 From 76de8b659ff684c8d160ab85a562ce9a1a33aaec Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 24 Apr 2025 13:40:23 -0400 Subject: [PATCH 35/55] add hgmd --- v03_pipeline/lib/tasks/exports/misc.py | 6 ++++-- .../lib/tasks/exports/write_new_variants_parquet.py | 13 +++++-------- .../exports/write_new_variants_parquet_test.py | 2 +- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/v03_pipeline/lib/tasks/exports/misc.py b/v03_pipeline/lib/tasks/exports/misc.py index 7321b3647..999f96ae7 100644 --- a/v03_pipeline/lib/tasks/exports/misc.py +++ b/v03_pipeline/lib/tasks/exports/misc.py @@ -176,8 +176,10 @@ def unmap_reference_dataset_annotation_enums( }, ) - # Explicit hgmd "class" edge case: - if hasattr(ht, ReferenceDataset.hgmd.value) and ReferenceDataset.hgmd in reference_datasets: + # Explicit hgmd edge case: + if (ReferenceDataset.hgmd in reference_datasets) and hasattr( + ht, ReferenceDataset.hgmd.value, + ): ht = ht.annotate( **{ ReferenceDataset.hgmd.value: ht[ diff --git a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py index 2ccf80417..5653166a0 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py +++ b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py @@ -6,7 +6,6 @@ new_variants_parquet_path, new_variants_table_path, ) -from v03_pipeline.lib.reference_datasets.reference_dataset import BaseReferenceDataset from v03_pipeline.lib.tasks.base.base_loading_run_params import ( BaseLoadingRunParams, ) @@ -83,6 +82,11 @@ def create_table(self) 
-> None: CAID=ht.CAID, liftedOverChrom=ht.rg37_locus.contig, liftedOverPos=ht.rg37_locus.position, + hgmd=( + ht.hgmd + if hasattr(ht, 'hgmd') + else hl.missing(hl.tstruct(accession=hl.tstr, class_=hl.tstr)) + ), screenRegionType=ht.screen.region_types.first(), predictions=hl.Struct( cadd=ht.dbnsfp.CADD_phred, @@ -135,11 +139,4 @@ def create_table(self) -> None: ), ), **{f: ht[f] for f in array_structexpression_fields(ht)}, - **{ - rd: ht[rd] - for rd in BaseReferenceDataset.for_reference_genome_dataset_type_private( - self.reference_genome, - self.dataset_type, - ) - }, ) diff --git a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py index ffaeda122..c5c3592a9 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py +++ b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py @@ -85,6 +85,7 @@ def test_write_new_variants_parquet_test( 'CAID': 'CA502654', 'liftedOverChrom': '1', 'liftedOverPos': 874501, + 'hgmd': {'accession': 'abcdefg', 'class_': 'DFP'}, 'screenRegionType': None, 'predictions': { 'cadd': 23.5, @@ -159,7 +160,6 @@ def test_write_new_variants_parquet_test( 'geneId': 'ENSG00000187634', }, ], - 'hgmd': None, }, ], ) From 125ea9509eada40c03326ddc2dfbc9dc0c48ddc1 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 24 Apr 2025 15:05:28 -0400 Subject: [PATCH 36/55] sort it --- v03_pipeline/lib/tasks/exports/misc.py | 31 +++++++++---------- .../exports/write_new_entries_parquet_test.py | 19 ++++++++++++ .../exports/write_new_transcripts_parquet.py | 3 +- .../write_new_transcripts_parquet_test.py | 4 +++ .../exports/write_new_variants_parquet.py | 2 +- 5 files changed, 40 insertions(+), 19 deletions(-) diff --git a/v03_pipeline/lib/tasks/exports/misc.py b/v03_pipeline/lib/tasks/exports/misc.py index 999f96ae7..ef5fca902 100644 --- a/v03_pipeline/lib/tasks/exports/misc.py +++ b/v03_pipeline/lib/tasks/exports/misc.py @@ -28,16 
+28,14 @@ def camelcase_hl_struct(s: hl.StructExpression) -> hl.StructExpression: def array_structexpression_fields(ht: hl.Table): - return sorted( - [ - field - for field in ht.row - if isinstance( - ht[field], - hl.expr.expressions.typed_expressions.ArrayStructExpression, - ) - ], - ) + return [ + field + for field in ht.row + if isinstance( + ht[field], + hl.expr.expressions.typed_expressions.ArrayStructExpression, + ) + ] def transcripts_field_name( @@ -178,16 +176,15 @@ def unmap_reference_dataset_annotation_enums( # Explicit hgmd edge case: if (ReferenceDataset.hgmd in reference_datasets) and hasattr( - ht, ReferenceDataset.hgmd.value, + ht, + ReferenceDataset.hgmd.value, ): ht = ht.annotate( **{ - ReferenceDataset.hgmd.value: ht[ - ReferenceDataset.hgmd.value - ].annotate( - class_=ht[ReferenceDataset.hgmd.value]['class'] - ).drop('class') - } + ReferenceDataset.hgmd.value: ht[ReferenceDataset.hgmd.value] + .annotate(class_=ht[ReferenceDataset.hgmd.value]['class']) + .drop('class'), + }, ) return ht.annotate_globals(enums=ht.globals.enums.drop(*unmapped_annotation_name)) diff --git a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py index b37664242..605e57ec4 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py +++ b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py @@ -119,3 +119,22 @@ def test_write_new_entries_parquet(self): }, ], ) + self.assertEqual( + export_json[-1], + { + 'key': 27, + 'family_guid': 'def_1', + 'filters': [], + 'is_gnomad_gt_5_percent': False, + 'project_guid': 'R0114_project4', + 'sample_type': 'WGS', + 'xpos': 1000902024, + 'calls': { + 'sampleId': ['NA20885_1'], + 'gt': [1], + 'gq': [4], + 'ab': [0.10000000149011612], + 'dp': [10], + }, + }, + ) diff --git a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py index 126085093..28cc1294e 
100644 --- a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py +++ b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py @@ -63,8 +63,9 @@ def create_table(self) -> None: transcripts=ht[ transcripts_field_name(self.reference_genome, self.dataset_type) ] + .map(lambda s: hl.struct(**{k: s[k] for k in sorted(s)})) .group_by( lambda c: c.geneId, ) - .items(), + .items() ) diff --git a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py index 8c0e651dc..4f0533589 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py +++ b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py @@ -129,3 +129,7 @@ def test_write_new_transcripts_parquet_test( 'consequenceTerms': ['missense_variant'], }, ) + self.assertEqual( + list(export_json[0]['transcripts'][0]['_1'][0].keys()), + sorted(export_json[0]['transcripts'][0]['_1'][0].keys()), + ) diff --git a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py index 5653166a0..d1c8fe2e2 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py +++ b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py @@ -138,5 +138,5 @@ def create_table(self) -> None: hom=ht.topmed.Hom, ), ), - **{f: ht[f] for f in array_structexpression_fields(ht)}, + **{f: ht[f] for f in sorted(array_structexpression_fields(ht))}, ) From 9528d3cbcf32432e0acf02c96e0a09e1ff038545 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 24 Apr 2025 15:37:34 -0400 Subject: [PATCH 37/55] ruff --- .../lib/tasks/exports/write_new_transcripts_parquet.py | 2 +- .../exports/write_new_transcripts_parquet_test.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py index 
28cc1294e..41c74ae8c 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py +++ b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py @@ -67,5 +67,5 @@ def create_table(self) -> None: .group_by( lambda c: c.geneId, ) - .items() + .items(), ) diff --git a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py index 4f0533589..2a60d5cbb 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py +++ b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py @@ -101,19 +101,21 @@ def test_write_new_transcripts_parquet_test( self.assertEqual( export_json[0]['transcripts'][0]['_1'][0], { + 'alphamissense': {'pathogenicity': None}, 'aminoAcids': 'S/L', + 'biotype': 'protein_coding', 'canonical': 1.0, 'codons': 'tCg/tTg', + 'consequenceTerms': ['missense_variant'], + 'exon': {'index': 6, 'total': 14}, 'geneId': 'ENSG00000187634', 'hgvsc': 'ENST00000616016.5:c.1049C>T', 'hgvsp': 'ENSP00000478421.2:p.Ser350Leu', + 'intron': None, 'transcriptId': 'ENST00000616016', 'maneSelect': 'NM_001385641.1', 'manePlusClinical': None, - 'exon': {'index': 6, 'total': 14}, - 'intron': None, 'refseqTranscriptId': 'NM_001385641.1', - 'alphamissense': {'pathogenicity': None}, 'loftee': {'isLofNagnag': None, 'lofFilters': None}, 'spliceregion': { 'extended_intronic_splice_region_variant': False, @@ -125,8 +127,6 @@ def test_write_new_transcripts_parquet_test( 'fiveutrAnnotation': None, 'fiveutrConsequence': None, }, - 'biotype': 'protein_coding', - 'consequenceTerms': ['missense_variant'], }, ) self.assertEqual( From bb5aab364f3a3c82e74fcd2ea9710b0c203b7039 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 24 Apr 2025 17:41:02 -0400 Subject: [PATCH 38/55] bugfixes --- v03_pipeline/lib/tasks/exports/misc.py | 2 +- v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) 
diff --git a/v03_pipeline/lib/tasks/exports/misc.py b/v03_pipeline/lib/tasks/exports/misc.py index ef5fca902..470e7cbe7 100644 --- a/v03_pipeline/lib/tasks/exports/misc.py +++ b/v03_pipeline/lib/tasks/exports/misc.py @@ -175,7 +175,7 @@ def unmap_reference_dataset_annotation_enums( ) # Explicit hgmd edge case: - if (ReferenceDataset.hgmd in reference_datasets) and hasattr( + if hasattr( ht, ReferenceDataset.hgmd.value, ): diff --git a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py index 6ac897125..d82889176 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py +++ b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py @@ -114,7 +114,7 @@ def create_table(self) -> None: ht = ht.key_by() ht = ht.select( - key=ht.key_, + key_=ht.key_, project_guid=project_guid, family_guid=ht.family_entries.family_guid[0], sample_type=self.sample_type.value, From 9d9a13830b6a250e2f8c4cf24a83b4d4def97bde Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Fri, 25 Apr 2025 11:00:55 -0400 Subject: [PATCH 39/55] update test to new format --- .../exports/write_new_entries_parquet.py | 26 ++++---- .../exports/write_new_entries_parquet_test.py | 66 +++++++++++++------ 2 files changed, 59 insertions(+), 33 deletions(-) diff --git a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py index d82889176..5d2ff90f8 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py +++ b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py @@ -89,8 +89,9 @@ def create_table(self) -> None: ht = ht.annotate( family_entries=hl.enumerate(ht.family_entries).starmap( lambda i, fs: hl.enumerate(fs).starmap( - lambda _, e: e.annotate( + lambda j, e: e.annotate( family_guid=ht.family_guids[i], # noqa: B023 + sampleId=ht.family_samples[ht.family_guids[i]][j], ), ), ), @@ -108,7 +109,7 @@ def create_table(self) -> None: 
# the family entries ht will contain rows # where at least one family is defined... after explosion, - # those rows should be removed. + # rows where a family is not defined should be removed. ht = ht.explode(ht.family_entries) ht = ht.filter(hl.is_defined(ht.family_entries)) @@ -121,19 +122,20 @@ def create_table(self) -> None: xpos=get_expr_for_xpos(ht.locus), is_gnomad_gt_5_percent=hl.is_defined(ht.is_gt_5_percent), filters=ht.filters, - calls=hl.Struct( - sampleId=ht.family_samples[ht.family_entries.family_guid[0]], - gt=ht.family_entries.GT.map( - lambda x: hl.case() - .when(x.is_hom_ref(), 0) - .when(x.is_het(), 1) - .when(x.is_hom_var(), 2) + calls=ht.family_entries.map( + lambda fe: hl.Struct( + sampleId=fe.sampleId, + gt=hl.case() + .when(fe.GT.is_hom_ref(), 0) + .when(fe.GT.is_het(), 1) + .when(fe.GT.is_hom_var(), 2) .default(hl.missing(hl.tint32)), + gq=fe.GQ, + ab=fe.AB, + dp=fe.DP, ), - gq=ht.family_entries.GQ, - ab=ht.family_entries.AB, - dp=ht.family_entries.DP, ), + sign=1, ) unioned_ht = unioned_ht.union(ht) if unioned_ht else ht return unioned_ht diff --git a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py index 605e57ec4..701cd2e9a 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py +++ b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py @@ -93,13 +93,30 @@ def test_write_new_entries_parquet(self): 'xpos': 1000876499, 'is_gnomad_gt_5_percent': False, 'filters': [], - 'calls': { - 'sampleId': ['HG00731_1', 'HG00732_1', 'HG00733_1'], - 'gt': [2, 2, 2], - 'gq': [21, 24, 12], - 'ab': [1.0, 1.0, 1.0], - 'dp': [7, 8, 4], - }, + 'calls': [ + { + 'sampleId': 'HG00731_1', + 'gt': 2, + 'gq': 21, + 'ab': 1.0, + 'dp': 7, + }, + { + 'sampleId': 'HG00732_1', + 'gt': 2, + 'gq': 24, + 'ab': 1.0, + 'dp': 8, + }, + { + 'sampleId': 'HG00733_1', + 'gt': 2, + 'gq': 12, + 'ab': 1.0, + 'dp': 4, + }, + ], + 'sign': 1, }, { 'key': 3, @@ -109,16 
+126,28 @@ def test_write_new_entries_parquet(self): 'xpos': 1000878314, 'is_gnomad_gt_5_percent': False, 'filters': ['VQSRTrancheSNP99.00to99.90'], - 'calls': { - 'sampleId': ['HG00731_1', 'HG00732_1', 'HG00733_1'], - 'gt': [1, 0, 1], - 'gq': [30, 6, 61], - 'ab': [0.3333333432674408, 0.0, 0.6000000238418579], - 'dp': [3, 2, 5], - }, + 'calls': [ + { + 'sampleId': 'HG00731_1', + 'gt': 1, + 'gq': 30, + 'ab': 0.3333333432674408, + 'dp': 3, + }, + {'sampleId': 'HG00732_1', 'gt': 0, 'gq': 6, 'ab': 0.0, 'dp': 2}, + { + 'sampleId': 'HG00733_1', + 'gt': 1, + 'gq': 61, + 'ab': 0.6000000238418579, + 'dp': 5, + }, + ], + 'sign': 1, }, ], ) + print(export_json[-1]) self.assertEqual( export_json[-1], { @@ -129,12 +158,7 @@ def test_write_new_entries_parquet(self): 'project_guid': 'R0114_project4', 'sample_type': 'WGS', 'xpos': 1000902024, - 'calls': { - 'sampleId': ['NA20885_1'], - 'gt': [1], - 'gq': [4], - 'ab': [0.10000000149011612], - 'dp': [10], - }, + 'calls': [{'sampleId': 'NA20885_1', 'gt': 1, 'gq': 4, 'ab': 0.10000000149011612, 'dp': 10}], + 'sign': 1, }, ) From 146929878f24b198a8f4222ea58d1948cbff5cd2 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Fri, 25 Apr 2025 11:04:31 -0400 Subject: [PATCH 40/55] ruff --- .../lib/tasks/exports/write_new_entries_parquet.py | 2 +- .../tasks/exports/write_new_entries_parquet_test.py | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py index 5d2ff90f8..6955a579e 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py +++ b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py @@ -91,7 +91,7 @@ def create_table(self) -> None: lambda i, fs: hl.enumerate(fs).starmap( lambda j, e: e.annotate( family_guid=ht.family_guids[i], # noqa: B023 - sampleId=ht.family_samples[ht.family_guids[i]][j], + sampleId=ht.family_samples[ht.family_guids[i]][j], # noqa: B023 
), ), ), diff --git a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py index 701cd2e9a..7650bfb7d 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py +++ b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py @@ -147,7 +147,6 @@ def test_write_new_entries_parquet(self): }, ], ) - print(export_json[-1]) self.assertEqual( export_json[-1], { @@ -158,7 +157,15 @@ def test_write_new_entries_parquet(self): 'project_guid': 'R0114_project4', 'sample_type': 'WGS', 'xpos': 1000902024, - 'calls': [{'sampleId': 'NA20885_1', 'gt': 1, 'gq': 4, 'ab': 0.10000000149011612, 'dp': 10}], + 'calls': [ + { + 'sampleId': 'NA20885_1', + 'gt': 1, + 'gq': 4, + 'ab': 0.10000000149011612, + 'dp': 10, + }, + ], 'sign': 1, }, ) From ac967b642a6f81e7b9b4681060c2732fe08908b3 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Fri, 25 Apr 2025 14:26:30 -0400 Subject: [PATCH 41/55] v03 --- v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py index 6955a579e..2242504c7 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py +++ b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py @@ -91,7 +91,7 @@ def create_table(self) -> None: lambda i, fs: hl.enumerate(fs).starmap( lambda j, e: e.annotate( family_guid=ht.family_guids[i], # noqa: B023 - sampleId=ht.family_samples[ht.family_guids[i]][j], # noqa: B023 + sampleId=ht.family_samples[ht.family_guids[i]][j], # noqa: B023 ), ), ), From bd1a037688ab16c6fe91c795f8600990e75ed043 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Mon, 28 Apr 2025 11:32:55 -0400 Subject: [PATCH 42/55] merge --- v03_pipeline/lib/annotations/misc.py | 1 + .../lib/tasks/base/base_update_variant_annotations_table.py | 3 --- 2 
files changed, 1 insertion(+), 3 deletions(-) diff --git a/v03_pipeline/lib/annotations/misc.py b/v03_pipeline/lib/annotations/misc.py index c764624ff..af7c359a6 100644 --- a/v03_pipeline/lib/annotations/misc.py +++ b/v03_pipeline/lib/annotations/misc.py @@ -46,6 +46,7 @@ def annotate_reference_dataset_globals( ) return ht + def annotate_formatting_annotation_enum_globals( ht: hl.Table, reference_genome: ReferenceGenome, diff --git a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py index f8c8f82d4..f16f50bc6 100644 --- a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py +++ b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py @@ -1,9 +1,6 @@ import hail as hl import luigi -from v03_pipeline.lib.annotations.misc import ( - annotate_formatting_annotation_enum_globals, -) from v03_pipeline.lib.paths import ( variant_annotations_table_path, ) From 58b33558922e7152938f46eb5f22dc8bbb07443b Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Mon, 28 Apr 2025 12:09:28 -0400 Subject: [PATCH 43/55] no longer used --- .../lib/reference_datasets/reference_dataset.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/v03_pipeline/lib/reference_datasets/reference_dataset.py b/v03_pipeline/lib/reference_datasets/reference_dataset.py index 7dac429ae..8ed022cfb 100644 --- a/v03_pipeline/lib/reference_datasets/reference_dataset.py +++ b/v03_pipeline/lib/reference_datasets/reference_dataset.py @@ -56,21 +56,6 @@ def for_reference_genome_dataset_type( } return set(reference_datasets) - @classmethod - def for_reference_genome_dataset_type_private( - cls, - reference_genome: ReferenceGenome, - dataset_type: DatasetType, - ) -> set[Union['ReferenceDataset']]: - return { - dataset - for dataset in cls.for_reference_genome_dataset_type( - reference_genome, - dataset_type, - ) - if dataset.access_control == AccessControl.PRIVATE - } - 
@classmethod def for_reference_genome_dataset_type_annotations( cls, From ae807c2ab89bccf657ed802637cffd5efd2bc6b1 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Mon, 28 Apr 2025 20:12:51 -0400 Subject: [PATCH 44/55] lint --- v03_pipeline/lib/model/feature_flag.py | 2 ++ v03_pipeline/lib/tasks/run_pipeline.py | 24 ++++++++++++++++++++---- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/v03_pipeline/lib/model/feature_flag.py b/v03_pipeline/lib/model/feature_flag.py index a24b29b6b..8948f4de7 100644 --- a/v03_pipeline/lib/model/feature_flag.py +++ b/v03_pipeline/lib/model/feature_flag.py @@ -7,6 +7,7 @@ ) CHECK_SEX_AND_RELATEDNESS = os.environ.get('CHECK_SEX_AND_RELATEDNESS') == '1' EXPECT_TDR_METRICS = os.environ.get('EXPECT_TDR_METRICS') == '1' +EXPORT_TO_PARQUET = os.environ.get('EXPORT_TO_PARQUET') == '1' INCLUDE_PIPELINE_VERSION_IN_PREFIX = ( os.environ.get('INCLUDE_PIPELINE_VERSION_IN_PREFIX') == '1' ) @@ -21,6 +22,7 @@ class FeatureFlag: ACCESS_PRIVATE_REFERENCE_DATASETS: bool = ACCESS_PRIVATE_REFERENCE_DATASETS CHECK_SEX_AND_RELATEDNESS: bool = CHECK_SEX_AND_RELATEDNESS EXPECT_TDR_METRICS: bool = EXPECT_TDR_METRICS + EXPORT_TO_PARQUET: bool = EXPORT_TO_PARQUET INCLUDE_PIPELINE_VERSION_IN_PREFIX: bool = INCLUDE_PIPELINE_VERSION_IN_PREFIX RUN_PIPELINE_ON_DATAPROC: bool = RUN_PIPELINE_ON_DATAPROC SHOULD_TRIGGER_HAIL_BACKEND_RELOAD: bool = SHOULD_TRIGGER_HAIL_BACKEND_RELOAD diff --git a/v03_pipeline/lib/tasks/run_pipeline.py b/v03_pipeline/lib/tasks/run_pipeline.py index 1711143d2..0fe0a9093 100644 --- a/v03_pipeline/lib/tasks/run_pipeline.py +++ b/v03_pipeline/lib/tasks/run_pipeline.py @@ -4,6 +4,15 @@ from v03_pipeline.lib.tasks.base.base_loading_run_params import ( BaseLoadingRunParams, ) +from v03_pipeline.lib.tasks.exports.write_new_entries_parquet import ( + WriteNewEntriesParquetTask, +) +from v03_pipeline.lib.tasks.exports.write_new_transcripts_parquet import ( + WriteNewTranscriptsParquetTask, +) +from 
v03_pipeline.lib.tasks.exports.write_new_variants_parquet import ( + WriteNewVariantsParquetTask, +) from v03_pipeline.lib.tasks.update_variant_annotations_table_with_new_samples import ( UpdateVariantAnnotationsTableWithNewSamplesTask, ) @@ -11,17 +20,15 @@ from v03_pipeline.lib.tasks.write_project_family_tables import ( WriteProjectFamilyTablesTask, ) +from v03_pipeline.lib.model.feature_flag import FeatureFlag @luigi.util.inherits(BaseLoadingRunParams) class RunPipelineTask(luigi.WrapperTask): def requires(self): - requirements = [ + return [ self.clone(WriteMetadataForRunTask), self.clone(UpdateVariantAnnotationsTableWithNewSamplesTask), - ] - return [ - *requirements, *[ self.clone( WriteProjectFamilyTablesTask, @@ -29,4 +36,13 @@ def requires(self): ) for i in range(len(self.project_guids)) ], + *( + [ + self.clone(WriteNewEntriesParquetTask), + self.clone(WriteNewTranscriptsParquetTask), + self.clone(WriteNewVariantsParquetTask), + ] + if FeatureFlag.EXPORT_TO_PARQUET + else [] + ), ] From 026c1ba6e694e6cc8429e65d4a2cbda2ec18ad59 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Tue, 29 Apr 2025 01:20:02 -0400 Subject: [PATCH 45/55] formatting --- .../tasks/exports/write_new_entries_parquet.py | 18 +++++++++++------- .../exports/write_new_variants_parquet.py | 5 +---- v03_pipeline/lib/tasks/run_pipeline.py | 2 +- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py index 2242504c7..eb5a465f9 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py +++ b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py @@ -29,6 +29,10 @@ WriteRemappedAndSubsettedCallsetTask, ) +ANNOTATIONS_TABLE_TASK = 'annotations_table_task' +HIGH_AF_VARIANTS_TABLE_TASK = 'high_af_variants_table_task' +REMAPPED_AND_SUBSETTED_CALLSET_TASKS = 'remapped_and_subsetted_callset_tasks' + @luigi.util.inherits(BaseLoadingRunParams) 
class WriteNewEntriesParquetTask(BaseWriteParquetTask): @@ -43,10 +47,10 @@ def output(self) -> luigi.Target: def requires(self) -> list[luigi.Task]: return { - 'annotations_table_task': ( + ANNOTATIONS_TABLE_TASK: ( self.clone(UpdateVariantAnnotationsTableWithNewSamplesTask) ), - 'remapped_and_subsetted_callset_tasks': [ + REMAPPED_AND_SUBSETTED_CALLSET_TASKS: [ self.clone( WriteRemappedAndSubsettedCallsetTask, project_i=i, @@ -55,7 +59,7 @@ def requires(self) -> list[luigi.Task]: ], **( { - 'high_af_variants_table_task': self.clone( + HIGH_AF_VARIANTS_TABLE_TASK: self.clone( UpdatedReferenceDatasetQueryTask, reference_dataset_query=ReferenceDatasetQuery.high_af_variants, ), @@ -73,7 +77,7 @@ def create_table(self) -> None: unioned_ht = None for project_guid, remapped_and_subsetted_callset_task in zip( self.project_guids, - self.input()['remapped_and_subsetted_callset_tasks'], + self.input()[REMAPPED_AND_SUBSETTED_CALLSET_TASKS], strict=True, ): mt = hl.read_matrix_table(remapped_and_subsetted_callset_task.path) @@ -97,13 +101,13 @@ def create_table(self) -> None: ), ) annotations_ht = hl.read_table( - self.input()['annotations_table_task'].path, + self.input()[ANNOTATIONS_TABLE_TASK].path, ) ht = ht.join(annotations_ht) - if self.input().get('high_af_variants_table_task'): + if self.input().get(HIGH_AF_VARIANTS_TABLE_TASK): gnomad_high_af_ht = hl.read_table( - self.input()['high_af_variants_table_task'].path, + self.input()[HIGH_AF_VARIANTS_TABLE_TASK].path, ) ht = ht.join(gnomad_high_af_ht, 'left') diff --git a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py index d1c8fe2e2..36790b738 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py +++ b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py @@ -17,7 +17,7 @@ unmap_formatting_annotation_enums, unmap_reference_dataset_annotation_enums, ) -from v03_pipeline.lib.tasks.files import GCSorLocalFolderTarget, 
GCSorLocalTarget +from v03_pipeline.lib.tasks.files import GCSorLocalTarget from v03_pipeline.lib.tasks.update_new_variants_with_caids import ( UpdateNewVariantsWithCAIDsTask, ) @@ -35,9 +35,6 @@ def output(self) -> luigi.Target: ), ) - def complete(self) -> luigi.Target: - return GCSorLocalFolderTarget(self.output().path).exists() - def requires(self) -> list[luigi.Task]: return [ self.clone(UpdateNewVariantsWithCAIDsTask) diff --git a/v03_pipeline/lib/tasks/run_pipeline.py b/v03_pipeline/lib/tasks/run_pipeline.py index 0fe0a9093..e13cf595d 100644 --- a/v03_pipeline/lib/tasks/run_pipeline.py +++ b/v03_pipeline/lib/tasks/run_pipeline.py @@ -1,6 +1,7 @@ import luigi import luigi.util +from v03_pipeline.lib.model.feature_flag import FeatureFlag from v03_pipeline.lib.tasks.base.base_loading_run_params import ( BaseLoadingRunParams, ) @@ -20,7 +21,6 @@ from v03_pipeline.lib.tasks.write_project_family_tables import ( WriteProjectFamilyTablesTask, ) -from v03_pipeline.lib.model.feature_flag import FeatureFlag @luigi.util.inherits(BaseLoadingRunParams) From 6b172caf25b8a8699d1016e5670d499608ac6b7c Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Tue, 29 Apr 2025 09:59:19 -0400 Subject: [PATCH 46/55] special case the export --- v03_pipeline/lib/model/dataset_type.py | 5 +++++ v03_pipeline/lib/tasks/run_pipeline.py | 1 + 2 files changed, 6 insertions(+) diff --git a/v03_pipeline/lib/model/dataset_type.py b/v03_pipeline/lib/model/dataset_type.py index 0b421a138..9e71cb638 100644 --- a/v03_pipeline/lib/model/dataset_type.py +++ b/v03_pipeline/lib/model/dataset_type.py @@ -382,6 +382,11 @@ def filter_invalid_sites(self): def should_export_to_vcf(self): return self == DatasetType.SV + def should_export_to_parquet(self, reference_genome: ReferenceGenome): + return ( + self == DatasetType.SNV_INDEL and reference_genome == ReferenceGenome.GRCh38 + ) + @property def export_vcf_annotation_fns(self) -> list[Callable[..., hl.Expression]]: return { diff --git 
a/v03_pipeline/lib/tasks/run_pipeline.py b/v03_pipeline/lib/tasks/run_pipeline.py index e13cf595d..45d84e289 100644 --- a/v03_pipeline/lib/tasks/run_pipeline.py +++ b/v03_pipeline/lib/tasks/run_pipeline.py @@ -43,6 +43,7 @@ def requires(self): self.clone(WriteNewVariantsParquetTask), ] if FeatureFlag.EXPORT_TO_PARQUET + and self.dataset_type.should_export_to_parquet(self.reference_genome) else [] ), ] From 6f4b736f483e07aecf9995ffc26553876f6e503d Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Tue, 29 Apr 2025 16:03:52 -0400 Subject: [PATCH 47/55] remove gene/map --- .../lib/tasks/exports/write_new_transcripts_parquet.py | 4 ---- .../exports/write_new_transcripts_parquet_test.py | 10 +++------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py index 41c74ae8c..655f850e8 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py +++ b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py @@ -64,8 +64,4 @@ def create_table(self) -> None: transcripts_field_name(self.reference_genome, self.dataset_type) ] .map(lambda s: hl.struct(**{k: s[k] for k in sorted(s)})) - .group_by( - lambda c: c.geneId, - ) - .items(), ) diff --git a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py index 2a60d5cbb..51acd07e8 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py +++ b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py @@ -95,11 +95,7 @@ def test_write_new_transcripts_parquet_test( 0, ) self.assertEqual( - export_json[0]['transcripts'][0]['_0'], - 'ENSG00000187634', - ) - self.assertEqual( - export_json[0]['transcripts'][0]['_1'][0], + export_json[0]['transcripts'][0], { 'alphamissense': {'pathogenicity': None}, 'aminoAcids': 'S/L', @@ -130,6 +126,6 @@ def 
test_write_new_transcripts_parquet_test( }, ) self.assertEqual( - list(export_json[0]['transcripts'][0]['_1'][0].keys()), - sorted(export_json[0]['transcripts'][0]['_1'][0].keys()), + list(export_json[0]['transcripts'][0].keys()), + sorted(export_json[0]['transcripts'][0].keys()), ) From 5af48f5d7160ef64064a83f1f962975885c2d5dc Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Tue, 29 Apr 2025 16:07:26 -0400 Subject: [PATCH 48/55] ruff --- .../lib/tasks/exports/write_new_transcripts_parquet.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py index 655f850e8..fc44c5199 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py +++ b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py @@ -62,6 +62,5 @@ def create_table(self) -> None: key_=ht.key_, transcripts=ht[ transcripts_field_name(self.reference_genome, self.dataset_type) - ] - .map(lambda s: hl.struct(**{k: s[k] for k in sorted(s)})) + ].map(lambda s: hl.struct(**{k: s[k] for k in sorted(s)})), ) From 585724fbe112cdbb50e440b63f3888d409ef9b87 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Tue, 29 Apr 2025 18:31:38 -0400 Subject: [PATCH 49/55] add new annotations --- .../tasks/exports/write_new_transcripts_parquet.py | 13 ++++++++++--- .../exports/write_new_transcripts_parquet_test.py | 3 +++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py index fc44c5199..43665a70b 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py +++ b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py @@ -60,7 +60,14 @@ def create_table(self) -> None: ht = ht.key_by() return ht.select( key_=ht.key_, - transcripts=ht[ - 
transcripts_field_name(self.reference_genome, self.dataset_type) - ].map(lambda s: hl.struct(**{k: s[k] for k in sorted(s)})), + transcripts=hl.enumerate( + ht[transcripts_field_name(self.reference_genome, self.dataset_type)] + ) + .starmap( + lambda i, s: s.annotate( + majorConsequence=s.consequenceTerms.first(), + transcriptRank=i, + ) + ) + .map(lambda s: s.select(**{k: s[k] for k in sorted(s)})), ) diff --git a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py index 51acd07e8..ee8136883 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py +++ b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py @@ -94,6 +94,7 @@ def test_write_new_transcripts_parquet_test( export_json[0]['key'], 0, ) + print(export_json[0]['transcripts'][0]) self.assertEqual( export_json[0]['transcripts'][0], { @@ -109,6 +110,8 @@ def test_write_new_transcripts_parquet_test( 'hgvsp': 'ENSP00000478421.2:p.Ser350Leu', 'intron': None, 'transcriptId': 'ENST00000616016', + 'transcriptRank': 0, + 'majorConsequence': 'missense_variant', 'maneSelect': 'NM_001385641.1', 'manePlusClinical': None, 'refseqTranscriptId': 'NM_001385641.1', From 459e5102db6ce00bd047963d79a642b8a57b0d00 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Tue, 29 Apr 2025 18:32:11 -0400 Subject: [PATCH 50/55] print --- .../lib/tasks/exports/write_new_transcripts_parquet_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py index ee8136883..1e89676e0 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py +++ b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py @@ -94,7 +94,6 @@ def test_write_new_transcripts_parquet_test( export_json[0]['key'], 0, ) - print(export_json[0]['transcripts'][0]) 
self.assertEqual( export_json[0]['transcripts'][0], { From c19c16b484f09ad23da9990989cea6b7a13b5198 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Tue, 29 Apr 2025 18:32:25 -0400 Subject: [PATCH 51/55] ruff --- .../lib/tasks/exports/write_new_transcripts_parquet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py index 43665a70b..d3f96c518 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py +++ b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py @@ -61,13 +61,13 @@ def create_table(self) -> None: return ht.select( key_=ht.key_, transcripts=hl.enumerate( - ht[transcripts_field_name(self.reference_genome, self.dataset_type)] + ht[transcripts_field_name(self.reference_genome, self.dataset_type)], ) .starmap( lambda i, s: s.annotate( majorConsequence=s.consequenceTerms.first(), transcriptRank=i, - ) + ), ) .map(lambda s: s.select(**{k: s[k] for k in sorted(s)})), ) From ca2daf5f2c19eb4456a36bfd191f3fecbf19aed6 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Mon, 5 May 2025 11:55:12 -0400 Subject: [PATCH 52/55] canonical is not a float --- .../lib/tasks/exports/write_new_transcripts_parquet_test.py | 2 +- .../lib/tasks/exports/write_new_variants_parquet_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py index 1e89676e0..5be6a1453 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py +++ b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py @@ -100,7 +100,7 @@ def test_write_new_transcripts_parquet_test( 'alphamissense': {'pathogenicity': None}, 'aminoAcids': 'S/L', 'biotype': 'protein_coding', - 'canonical': 1.0, + 'canonical': 1, 'codons': 
'tCg/tTg', 'consequenceTerms': ['missense_variant'], 'exon': {'index': 6, 'total': 14}, diff --git a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py index c5c3592a9..285389063 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py +++ b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py @@ -153,7 +153,7 @@ def test_write_new_variants_parquet_test( 'sortedTranscriptConsequences': [ { 'alphamissensePathogenicity': None, - 'canonical': 1.0, + 'canonical': 1, 'consequenceTerms': ['missense_variant'], 'extendedIntronicSpliceRegionVariant': False, 'fiveutrConsequence': None, From 1b4b2653f73b2e782c741e5b3853777d40b9e8fa Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Tue, 6 May 2025 17:15:48 -0400 Subject: [PATCH 53/55] add new clinvar variants parquet export (#1092) --- v03_pipeline/lib/paths.py | 15 ++++ .../write_new_clinvar_variants_parquet.py | 58 +++++++++++++ ...write_new_clinvar_variants_parquet_test.py | 87 +++++++++++++++++++ 3 files changed, 160 insertions(+) create mode 100644 v03_pipeline/lib/tasks/exports/write_new_clinvar_variants_parquet.py create mode 100644 v03_pipeline/lib/tasks/exports/write_new_clinvar_variants_parquet_test.py diff --git a/v03_pipeline/lib/paths.py b/v03_pipeline/lib/paths.py index d07d8f5bc..98f859d03 100644 --- a/v03_pipeline/lib/paths.py +++ b/v03_pipeline/lib/paths.py @@ -363,6 +363,21 @@ def variant_annotations_vcf_path( ) +def new_clinvar_variants_parquet_path( + reference_genome: ReferenceGenome, + dataset_type: DatasetType, + run_id: str, +) -> str: + return os.path.join( + runs_path( + reference_genome, + dataset_type, + ), + run_id, + 'new_clinvar_variants.parquet', + ) + + def new_entries_parquet_path( reference_genome: ReferenceGenome, dataset_type: DatasetType, diff --git a/v03_pipeline/lib/tasks/exports/write_new_clinvar_variants_parquet.py 
b/v03_pipeline/lib/tasks/exports/write_new_clinvar_variants_parquet.py new file mode 100644 index 000000000..a949d0e02 --- /dev/null +++ b/v03_pipeline/lib/tasks/exports/write_new_clinvar_variants_parquet.py @@ -0,0 +1,58 @@ +import hail as hl +import luigi +import luigi.util + +from v03_pipeline.lib.paths import ( + new_clinvar_variants_parquet_path, + new_variants_table_path, +) +from v03_pipeline.lib.tasks.base.base_loading_run_params import ( + BaseLoadingRunParams, +) +from v03_pipeline.lib.tasks.base.base_write_parquet import BaseWriteParquetTask +from v03_pipeline.lib.tasks.exports.misc import ( + unmap_reference_dataset_annotation_enums, +) +from v03_pipeline.lib.tasks.files import GCSorLocalTarget +from v03_pipeline.lib.tasks.update_new_variants_with_caids import ( + UpdateNewVariantsWithCAIDsTask, +) +from v03_pipeline.lib.tasks.write_new_variants_table import WriteNewVariantsTableTask + + +@luigi.util.inherits(BaseLoadingRunParams) +class WriteNewClinvarVariantsParquetTask(BaseWriteParquetTask): + def output(self) -> luigi.Target: + return GCSorLocalTarget( + new_clinvar_variants_parquet_path( + self.reference_genome, + self.dataset_type, + self.run_id, + ), + ) + + def requires(self) -> list[luigi.Task]: + return [ + self.clone(UpdateNewVariantsWithCAIDsTask) + if self.dataset_type.should_send_to_allele_registry + else self.clone(WriteNewVariantsTableTask), + ] + + def create_table(self) -> hl.Table: + ht = hl.read_table( + new_variants_table_path( + self.reference_genome, + self.dataset_type, + self.run_id, + ), + ) + ht = unmap_reference_dataset_annotation_enums( + ht, + self.reference_genome, + self.dataset_type, + ) + ht = ht.filter(hl.is_defined(ht.clinvar)) + ht = ht.key_by() + ht = ht.select(key_=ht.key_, clinvar=ht.clinvar) + ht = ht.flatten() + return ht.rename({f: f.replace('clinvar.', '') for f in ht.row}) diff --git a/v03_pipeline/lib/tasks/exports/write_new_clinvar_variants_parquet_test.py 
b/v03_pipeline/lib/tasks/exports/write_new_clinvar_variants_parquet_test.py new file mode 100644 index 000000000..0ea73989a --- /dev/null +++ b/v03_pipeline/lib/tasks/exports/write_new_clinvar_variants_parquet_test.py @@ -0,0 +1,87 @@ +import os + +import hail as hl +import luigi.worker +import pandas as pd + +from v03_pipeline.lib.model import ( + DatasetType, + ReferenceGenome, + SampleType, +) +from v03_pipeline.lib.paths import ( + new_clinvar_variants_parquet_path, + new_variants_table_path, +) +from v03_pipeline.lib.tasks.exports.write_new_clinvar_variants_parquet import ( + WriteNewClinvarVariantsParquetTask, +) +from v03_pipeline.lib.test.misc import convert_ndarray_to_list +from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase + +TEST_SNV_INDEL_ANNOTATIONS = ( + 'v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht' +) + +TEST_RUN_ID = 'manual__2024-04-03' + + +class WriteNewClinvarVariantsParquetTest(MockedDatarootTestCase): + def setUp(self) -> None: + super().setUp() + ht = hl.read_table( + TEST_SNV_INDEL_ANNOTATIONS, + ) + ht.write( + new_variants_table_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + TEST_RUN_ID, + ), + ) + + def test_write_new_clinvar_variants_parquet_test( + self, + ) -> None: + worker = luigi.worker.Worker() + task = WriteNewClinvarVariantsParquetTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path='fake_callset', + project_guids=[ + 'fake_project', + ], + project_pedigree_paths=['fake_pedigree'], + skip_validation=True, + run_id=TEST_RUN_ID, + ) + worker.add(task) + worker.run() + self.assertTrue(task.output().exists()) + self.assertTrue(task.complete()) + df = pd.read_parquet( + os.path.join( + new_clinvar_variants_parquet_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + TEST_RUN_ID, + ), + ), + ) + export_json = convert_ndarray_to_list(df.head(1).to_dict('records')) + self.assertEqual( + 
export_json, + [ + { + 'key': 0, + 'alleleId': 929885, + 'conflictingPathogenicities': None, + 'goldStars': 1, + 'submitters': ['Labcorp Genetics (formerly Invitae), Labcorp'], + 'conditions': ['not provided'], + 'assertions': [], + 'pathogenicity': 'Uncertain_significance', + }, + ], + ) From ccbc80d533726ec437f90b34982e3edc85797e8f Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Tue, 6 May 2025 17:15:58 -0400 Subject: [PATCH 54/55] feat: grch37 SNV_INDEL export (#1095) * support grch37 * enable grch37 --- v03_pipeline/lib/model/dataset_type.py | 7 +- v03_pipeline/lib/tasks/exports/misc.py | 2 + v03_pipeline/lib/tasks/exports/misc_test.py | 122 +++++++++++++++++- .../write_new_transcripts_parquet_test.py | 71 ++++++++++ .../exports/write_new_variants_parquet.py | 28 +++- .../write_new_variants_parquet_test.py | 122 ++++++++++++++++++ v03_pipeline/lib/tasks/run_pipeline.py | 2 +- .../SNV_INDEL/annotations.ht/.README.txt.crc | Bin 0 -> 12 bytes .../SNV_INDEL/annotations.ht/._SUCCESS.crc | Bin 0 -> 8 bytes .../annotations.ht/.metadata.json.gz.crc | Bin 0 -> 16 bytes .../SNV_INDEL/annotations.ht/README.txt | 3 + .../GRCh37/SNV_INDEL/annotations.ht/_SUCCESS | 0 .../globals/.metadata.json.gz.crc | Bin 0 -> 16 bytes .../annotations.ht/globals/metadata.json.gz | Bin 0 -> 641 bytes .../annotations.ht/globals/parts/.part-0.crc | Bin 0 -> 36 bytes .../annotations.ht/globals/parts/part-0 | Bin 0 -> 3497 bytes .../.index.crc | Bin 0 -> 12 bytes .../.metadata.json.gz.crc | Bin 0 -> 12 bytes .../index | Bin 0 -> 78 bytes .../metadata.json.gz | Bin 0 -> 184 bytes .../SNV_INDEL/annotations.ht/metadata.json.gz | Bin 0 -> 953 bytes .../annotations.ht/rows/.metadata.json.gz.crc | Bin 0 -> 20 bytes .../annotations.ht/rows/metadata.json.gz | Bin 0 -> 1317 bytes ...0-ca83e9ef-60f3-4c8e-8ae5-b90998d2d0e4.crc | Bin 0 -> 16 bytes ...art-0-ca83e9ef-60f3-4c8e-8ae5-b90998d2d0e4 | Bin 0 -> 521 bytes 25 files changed, 347 insertions(+), 10 deletions(-) create mode 100644 
v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/.README.txt.crc create mode 100644 v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/._SUCCESS.crc create mode 100644 v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/.metadata.json.gz.crc create mode 100644 v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/README.txt create mode 100644 v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/_SUCCESS create mode 100644 v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/globals/.metadata.json.gz.crc create mode 100644 v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/globals/metadata.json.gz create mode 100644 v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/globals/parts/.part-0.crc create mode 100644 v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/globals/parts/part-0 create mode 100644 v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/index/part-0-ca83e9ef-60f3-4c8e-8ae5-b90998d2d0e4.idx/.index.crc create mode 100644 v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/index/part-0-ca83e9ef-60f3-4c8e-8ae5-b90998d2d0e4.idx/.metadata.json.gz.crc create mode 100644 v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/index/part-0-ca83e9ef-60f3-4c8e-8ae5-b90998d2d0e4.idx/index create mode 100644 v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/index/part-0-ca83e9ef-60f3-4c8e-8ae5-b90998d2d0e4.idx/metadata.json.gz create mode 100644 v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/metadata.json.gz create mode 100644 v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/rows/.metadata.json.gz.crc create mode 100644 v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/rows/metadata.json.gz create mode 100644 v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/rows/parts/.part-0-ca83e9ef-60f3-4c8e-8ae5-b90998d2d0e4.crc create mode 100644 
v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/rows/parts/part-0-ca83e9ef-60f3-4c8e-8ae5-b90998d2d0e4 diff --git a/v03_pipeline/lib/model/dataset_type.py b/v03_pipeline/lib/model/dataset_type.py index ca41b5ba6..54aba85b0 100644 --- a/v03_pipeline/lib/model/dataset_type.py +++ b/v03_pipeline/lib/model/dataset_type.py @@ -391,10 +391,9 @@ def filter_invalid_sites(self): def should_export_to_vcf(self): return self == DatasetType.SV - def should_export_to_parquet(self, reference_genome: ReferenceGenome): - return ( - self == DatasetType.SNV_INDEL and reference_genome == ReferenceGenome.GRCh38 - ) + @property + def should_export_to_parquet(self): + return self == DatasetType.SNV_INDEL @property def export_vcf_annotation_fns(self) -> list[Callable[..., hl.Expression]]: diff --git a/v03_pipeline/lib/tasks/exports/misc.py b/v03_pipeline/lib/tasks/exports/misc.py index 470e7cbe7..a79959866 100644 --- a/v03_pipeline/lib/tasks/exports/misc.py +++ b/v03_pipeline/lib/tasks/exports/misc.py @@ -89,6 +89,8 @@ def camelcase_array_structexpression_fields( ) # Custom handling of nested sorted_transcript_consequences fields for GRCh38 + # Note that spliceregion (extended_intronic_splice_region_variant) prevents + # a more procedural approach here. 
if ( reference_genome == ReferenceGenome.GRCh38 and 'sortedTranscriptConsequences' in ht.row diff --git a/v03_pipeline/lib/tasks/exports/misc_test.py b/v03_pipeline/lib/tasks/exports/misc_test.py index 5c98637b6..c0b8eca7e 100644 --- a/v03_pipeline/lib/tasks/exports/misc_test.py +++ b/v03_pipeline/lib/tasks/exports/misc_test.py @@ -15,6 +15,9 @@ TEST_SNV_INDEL_ANNOTATIONS = ( 'v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht' ) +TEST_GRCH37_SNV_INDEL_ANNOTATIONS = ( + 'v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht' +) class MiscTest(unittest.TestCase): @@ -25,7 +28,7 @@ def test_unmap_formatting_annotation_enums(self) -> None: ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, ) - self.assertListEqual( + self.assertCountEqual( list(ht.globals.enums.collect()[0].keys()), [ 'screen', @@ -170,6 +173,123 @@ def test_unmap_formatting_annotation_enums(self) -> None: ), ), ) + ht = hl.read_table(TEST_GRCH37_SNV_INDEL_ANNOTATIONS) + ht = unmap_formatting_annotation_enums( + ht, + ReferenceGenome.GRCh37, + DatasetType.SNV_INDEL, + ) + self.assertCountEqual( + list(ht.globals.enums.collect()[0].keys()), + [ + 'dbnsfp', + 'clinvar', + 'gnomad_exomes', + 'splice_ai', + 'exac', + 'topmed', + 'hgmd', + 'gnomad_genomes', + 'eigen', + ], + ) + ht = ht.annotate( + sorted_transcript_consequences=[ht.sorted_transcript_consequences[0]], + ) + self.assertEqual( + ht.collect()[0], + hl.Struct( + locus=hl.Locus(contig=1, position=69134, reference_genome='GRCh37'), + alleles=['A', 'G'], + rsid=None, + sorted_transcript_consequences=[ + hl.Struct( + amino_acids='E/G', + canonical=1, + codons='gAa/gGa', + gene_id='ENSG00000186092', + hgvsc='ENST00000335137.3:c.44A>G', + hgvsp='ENSP00000334393.3:p.Glu15Gly', + transcript_id='ENST00000335137', + is_lof_nagnag=None, + biotype='protein_coding', + consequence_terms=['missense_variant'], + lof_filters=None, + ), + ], + variant_id='1-69134-A-G', + xpos=1000069134, + gt_stats=hl.Struct(AC=25, AN=1246, 
AF=0.020064204931259155, hom=10), + CAID='CA502008', + rg38_locus=hl.Locus( + contig='chr1', + position=69134, + reference_genome='GRCh38', + ), + gnomad_exomes=hl.Struct( + AF=0.026665963232517242, + AN=18938, + AC=505, + Hom=127, + AF_POPMAX_OR_GLOBAL=0.08191808313131332, + FAF_AF=0.02474386990070343, + Hemi=0, + ), + hgmd=None, + gnomad_genomes=hl.Struct( + AF=0.0001722949673421681, + AN=5804, + AC=1, + Hom=0, + AF_POPMAX_OR_GLOBAL=0.0005662514013238251, + FAF_AF=0.0, + Hemi=0, + ), + dbnsfp=hl.Struct( + PrimateAI_score=0.37232041358947754, + fathmm_MKL_coding_score=0.056940000504255295, + CADD_phred=15.880000114440918, + SIFT_score=0.1289999932050705, + REVEL_score=0.07500000298023224, + Polyphen2_HVAR_score=0.0010000000474974513, + VEST4_score=0.10700000077486038, + MPC_score=1.8921889066696167, + MutPred_score=0.3779999911785126, + MutationTaster_pred_id=2, + ), + topmed=hl.Struct( + AC=95, + AF=0.0007565619889646769, + AN=125568, + Hom=0, + Het=95, + ), + exac=hl.Struct( + AF_POPMAX=None, + AF=0.0016550000291317701, + AC_Adj=0, + AC_Het=0, + AC_Hom=0, + AC_Hemi=None, + AN_Adj=66, + ), + eigen=hl.Struct(Eigen_phred=1.0019999742507935), + splice_ai=hl.Struct( + delta_score=0.019999999552965164, + splice_consequence_id=2, + ), + key_=1424, + clinvar=hl.Struct( + alleleId=2193183, + conflictingPathogenicities=None, + goldStars=1, + submitters=['Ambry Genetics'], + conditions=['not specified'], + assertion_ids=[], + pathogenicity_id=14, + ), + ), + ) def test_unmap_reference_dataset_annotation_enums(self) -> None: ht = hl.read_table(TEST_SNV_INDEL_ANNOTATIONS) diff --git a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py index 5be6a1453..bec8bbcf6 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py +++ b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py @@ -19,6 +19,9 @@ TEST_SNV_INDEL_ANNOTATIONS = ( 
'v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht' ) +TEST_GRCH37_SNV_INDEL_ANNOTATIONS = ( + 'v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht' +) TEST_RUN_ID = 'manual__2024-04-03' @@ -36,6 +39,16 @@ def setUp(self) -> None: TEST_RUN_ID, ), ) + ht = hl.read_table( + TEST_GRCH37_SNV_INDEL_ANNOTATIONS, + ) + ht.write( + new_variants_table_path( + ReferenceGenome.GRCh37, + DatasetType.SNV_INDEL, + TEST_RUN_ID, + ), + ) # Make an incomplete parquet to validate overwrite-ing. os.makedirs( @@ -131,3 +144,61 @@ def test_write_new_transcripts_parquet_test( list(export_json[0]['transcripts'][0].keys()), sorted(export_json[0]['transcripts'][0].keys()), ) + + def test_grch37_write_new_transcripts_parquet_test( + self, + ) -> None: + worker = luigi.worker.Worker() + task = WriteNewTranscriptsParquetTask( + reference_genome=ReferenceGenome.GRCh37, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path='fake_callset', + project_guids=[ + 'fake_project', + ], + project_pedigree_paths=['fake_pedigree'], + skip_validation=True, + run_id=TEST_RUN_ID, + ) + worker.add(task) + worker.run() + self.assertTrue(task.output().exists()) + self.assertTrue(task.complete()) + df = pd.read_parquet( + os.path.join( + new_transcripts_parquet_path( + ReferenceGenome.GRCh37, + DatasetType.SNV_INDEL, + TEST_RUN_ID, + ), + ), + ) + export_json = convert_ndarray_to_list(df.head(1).to_dict('records')) + self.assertListEqual(list(export_json[0].keys()), ['key', 'transcripts']) + self.assertEqual( + export_json[0]['key'], + 1424, + ) + self.assertEqual( + export_json[0]['transcripts'][0], + { + 'aminoAcids': 'E/G', + 'biotype': 'protein_coding', + 'canonical': 1, + 'codons': 'gAa/gGa', + 'consequenceTerms': ['missense_variant'], + 'geneId': 'ENSG00000186092', + 'hgvsc': 'ENST00000335137.3:c.44A>G', + 'hgvsp': 'ENSP00000334393.3:p.Glu15Gly', + 'isLofNagnag': None, + 'lofFilters': None, + 'majorConsequence': 'missense_variant', + 'transcriptId': 
'ENST00000335137', + 'transcriptRank': 0, + }, + ) + self.assertEqual( + list(export_json[0]['transcripts'][0].keys()), + sorted(export_json[0]['transcripts'][0].keys()), + ) diff --git a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py index 36790b738..f1aac59a2 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py +++ b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet.py @@ -77,19 +77,39 @@ def create_table(self) -> None: variantId=ht.variant_id, rsid=ht.rsid, CAID=ht.CAID, - liftedOverChrom=ht.rg37_locus.contig, - liftedOverPos=ht.rg37_locus.position, + liftedOverChrom=( + ht.rg37_locus.contig.replace('^chr', '') + if hasattr(ht, 'rg37_locus') + else ht.rg38_locus.contig.replace('^chr', '') + ), + liftedOverPos=( + ht.rg37_locus.position + if hasattr(ht, 'rg37_locus') + else ht.rg38_locus.position + ), hgmd=( ht.hgmd if hasattr(ht, 'hgmd') else hl.missing(hl.tstruct(accession=hl.tstr, class_=hl.tstr)) ), - screenRegionType=ht.screen.region_types.first(), + **( + { + 'screenRegionType': ht.screen.region_types.first(), + } + if hasattr(ht, 'screen') + else {} + ), predictions=hl.Struct( cadd=ht.dbnsfp.CADD_phred, eigen=ht.eigen.Eigen_phred, fathmm=ht.dbnsfp.fathmm_MKL_coding_score, - gnomad_noncoding=ht.gnomad_non_coding_constraint.z_score, + **( + { + 'gnomad_noncoding': ht.gnomad_non_coding_constraint.z_score, + } + if hasattr(ht, 'gnomad_non_coding_constraint') + else {} + ), mpc=ht.dbnsfp.MPC_score, mut_pred=ht.dbnsfp.MutPred_score, mut_tester=ht.dbnsfp.MutationTaster_pred, diff --git a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py index 285389063..f1dbec702 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py +++ b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py @@ -19,6 +19,9 @@ TEST_SNV_INDEL_ANNOTATIONS = ( 
'v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht' ) +TEST_GRCH37_SNV_INDEL_ANNOTATIONS = ( + 'v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht' +) TEST_RUN_ID = 'manual__2024-04-03' @@ -36,6 +39,16 @@ def setUp(self) -> None: TEST_RUN_ID, ), ) + ht = hl.read_table( + TEST_GRCH37_SNV_INDEL_ANNOTATIONS, + ) + ht.write( + new_variants_table_path( + ReferenceGenome.GRCh37, + DatasetType.SNV_INDEL, + TEST_RUN_ID, + ), + ) def test_write_new_variants_parquet_test( self, @@ -163,3 +176,112 @@ def test_write_new_variants_parquet_test( }, ], ) + + def test_grch37_write_new_variants_parquet_test( + self, + ) -> None: + worker = luigi.worker.Worker() + task = WriteNewVariantsParquetTask( + reference_genome=ReferenceGenome.GRCh37, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path='fake_callset', + project_guids=[ + 'fake_project', + ], + project_pedigree_paths=['fake_pedigree'], + skip_validation=True, + run_id=TEST_RUN_ID, + ) + worker.add(task) + worker.run() + self.assertTrue(task.output().exists()) + self.assertTrue(task.complete()) + df = pd.read_parquet( + os.path.join( + new_variants_parquet_path( + ReferenceGenome.GRCh37, + DatasetType.SNV_INDEL, + TEST_RUN_ID, + ), + ), + ) + export_json = convert_ndarray_to_list(df.head(1).to_dict('records')) + export_json[0]['sortedTranscriptConsequences'] = [ + export_json[0]['sortedTranscriptConsequences'][0], + ] + self.assertEqual( + export_json, + [ + { + 'key': 1424, + 'xpos': 1000069134, + 'chrom': '1', + 'pos': 69134, + 'ref': 'A', + 'alt': 'G', + 'variantId': '1-69134-A-G', + 'rsid': None, + 'CAID': 'CA502008', + 'liftedOverChrom': '1', + 'liftedOverPos': 69134, + 'hgmd': None, + 'predictions': { + 'cadd': 15.880000114440918, + 'eigen': 1.0019999742507935, + 'fathmm': 0.056940000504255295, + 'mpc': 1.8921889066696167, + 'mut_pred': 0.3779999911785126, + 'mut_tester': 'N', + 'polyphen': 0.0010000000474974513, + 'primate_ai': 0.37232041358947754, + 'revel': 
0.07500000298023224, + 'sift': 0.1289999932050705, + 'splice_ai': 0.019999999552965164, + 'splice_ai_consequence': 'Donor gain', + 'vest': 0.10700000077486038, + }, + 'populations': { + 'exac': { + 'ac': 0, + 'af': 0.0016550000291317701, + 'an': 66, + 'filter_af': None, + 'hemi': None, + 'het': 0, + 'hom': 0, + }, + 'gnomad_exomes': { + 'ac': 505, + 'af': 0.026665963232517242, + 'an': 18938, + 'filter_af': 0.08191808313131332, + 'hemi': 0, + 'hom': 127, + }, + 'gnomad_genomes': { + 'ac': 1, + 'af': 0.0001722949673421681, + 'an': 5804, + 'filter_af': 0.0005662514013238251, + 'hemi': 0, + 'hom': 0, + }, + 'topmed': { + 'ac': 95, + 'af': 0.0007565619889646769, + 'an': 125568, + 'het': 95, + 'hom': 0, + }, + }, + 'sortedTranscriptConsequences': [ + { + 'canonical': 1, + 'consequenceTerms': ['missense_variant'], + 'geneId': 'ENSG00000186092', + }, + ], + }, + ], + ) diff --git a/v03_pipeline/lib/tasks/run_pipeline.py b/v03_pipeline/lib/tasks/run_pipeline.py index 45d84e289..7ffe7c5b6 100644 --- a/v03_pipeline/lib/tasks/run_pipeline.py +++ b/v03_pipeline/lib/tasks/run_pipeline.py @@ -43,7 +43,7 @@ def requires(self): self.clone(WriteNewVariantsParquetTask), ] if FeatureFlag.EXPORT_TO_PARQUET - and self.dataset_type.should_export_to_parquet(self.reference_genome) + and self.dataset_type.should_export_to_parquet else [] ), ] diff --git a/v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/.README.txt.crc b/v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/.README.txt.crc new file mode 100644 index 0000000000000000000000000000000000000000..c1cb9e1cab5873fe48eb8a0ff03ee9b638156eab GIT binary patch literal 12 TcmYc;N@ieSU}7ki`TP?A5`P1> literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/._SUCCESS.crc b/v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/._SUCCESS.crc new file mode 100644 index 0000000000000000000000000000000000000000..3b7b044936a890cd8d651d349a752d819d71d22c GIT binary patch literal 
8 PcmYc;N@ieSU}69O2$TUk literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/.metadata.json.gz.crc new file mode 100644 index 0000000000000000000000000000000000000000..27052adc0642ce30dbdbd2868874e46cfe40d226 GIT binary patch literal 16 XcmYc;N@ieSU}9LD{6%thSLsRsCJP0T literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/README.txt b/v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/README.txt new file mode 100644 index 000000000..445c1e13a --- /dev/null +++ b/v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. + Written with version 0.2.133-4c60fddb171a + Created at 2025/05/04 18:49:34 \ No newline at end of file diff --git a/v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/_SUCCESS b/v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/globals/.metadata.json.gz.crc new file mode 100644 index 0000000000000000000000000000000000000000..f7194790a00342550eada4a668dd55976d68f913 GIT binary patch literal 16 XcmYc;N@ieSU}9Ld&c$;&H_KfBBE0>xl=dDst^@>bt}? 
zwo|oK`R}arWjhHc=u>ufckImWG}lCs3X&}Ot{G@xk3SwZ3g}IeCix3ol5D%fCn_0G zhbP%}Xn49PV4ubWR)K>s^Jcvw1~yjL)Lu`Z4VS`X5z6+$9F)VjAo`Nhl8Y139w>yY z2+QFtD=;25s^k=M!aW3MLVaYVs(|_USX6ArT7ZaYfg?1AO$wr}CO0-xTVT1>L>Q{M zvM7o$@T&$vVN$#2(xMaye$<*=wgd4l4SI45x>_hHeo7yWj7DikYve0IrTuR zegE8R-bbz0G1nmG(A$dG1%r+L%|H|DbW(Ig3$}^Vfn}n^XqQ)vGHD1Di|rAg7+Yh) zF_U>}h`(E5!ZeH`GC;ig(b6wpB$!dS>hmSvfZa}3yPs>c*gdwi)(cx53pehx-{^gP zIDnpfUB0eY?{7Y=d@4=ok(VqeS(TE$AZb*zci@Kq#)A91yW9JD+fQRz+xnk<_y$0# zX=mep_!5d}`tb5>smNmnFl{SR~ bfDv1}8izEFjaaLV>uLS~FMw%ZxCa0L^w=+| literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/globals/parts/.part-0.crc new file mode 100644 index 0000000000000000000000000000000000000000..6541f64d2cf041d415751216b33e828366927654 GIT binary patch literal 36 scmYc;N@ieSU}Ctk*Kq3wBmUee2h119ZQ%OJ(KapFxG|vM1wJ-f(pe;3Q09viIAyDv;wmB>-+Atah<#+52ygZObqRrSF zc;%|kgovJQL@IfU?EaD7B#B(oyyqtHB!~?FKMOPa^%gBHE!iUG(w|}}IzNGF{QnH0 zd&<~x76lOk2LcNMB2+kjqT(5jM~RJ$Ov*+Zp@Oo>WJW|d9TF838CnX7#7?+m@s7Y0qN&L6D#SmBdwwVA{V;>qF7kyg zz99{QB1Og2e)+fmeh>UGz4ie5*H-)490C&{;9}c-Mlm<{?5ag~ZT{^|721C9eKnC^ zU+dR?>^!&b*cy8wHfKHhznM9ZPQ=Ca#J@k6YCTYkOmCaM!4L!H&G|R} zym_(R^s}vYYq7m>enMP3KU~2uYgdPKo0N<2<5r0 ziyI=~^*00RYYgH{tq08b3wan!(J^nP*Dna)6%lsd8KS!u1gDq(gFmSC;4*d{;;^&Z zH%WAC&}ORd%-id0{S;^L1ozc9zMi7VxCt8$f=Mo^^ z+AqEyp-I1b{XK2{&tKoy7hjRgMx{eYEgoI{>rlP6LC~*!+M+BO`-+9J4 zBq201%8TrfNf8yJN{UiPyvPn5mROY0iHfkfVNpd>G?#^OeIsh`E@_blfyGBbSIdPy)*%1EuBsZwR>Qx^Tlw`f(NaOej) z_OSNtbBSG1``CEjKWI7#u6F;)^&(fOMdrz*)Ssw%fd6(>;SNp*k9P#w;gAT%;2dF=L7v@SLwC%@k>pDMoF%79V#2gAsvDQ+ZgZL8<$g;a+h&f#Cr-?c7PQ`B65|G8uz*GxJjm}~N6 zG!91Z*K^MrQ5KI4LI7a~6^@P;HduiM0YDhR1rRtvh-ku@3;6pX~KE8{YWUQ&uKX~Lq6lBh_U zbBf~JYUSv(CY&fqonc=MAdbxb5uF^lm6gENYo*qn0XW{u=FvuJJ-6=j)JqiM@7 z>YB2RqeBWecu+!&6OF^df(ja-;KA8F+OFhqJB49U`{*vzbJ5vD9oxKJMme%c@hXd! 
zjPU|PP13BWb!AqTMH@A@Xqzr7q8a@NtJFj}Oh%tW`q~1HNt)DR>Q=;}hQe|e{a|S0 z``ty$U?wSrLqp3{+*VE1MqN>2(X_T4D>_I)hU&fLJoGD9j1feK6nlL%?*2<>&|{&>vS1o9Du{a%JgiEikgZzQ5?=CQ7&w^`3-A$lvLC{IGoO{ z=kZE6=ojmMuk)&!IGh=mG(~Ly4#(<=q94g-;1};ZKgT5f8amJHlcc0bMa{(wP?$|o zYk_@OET*W5tbo|#8GE6*dHs)v^$q2TS98oFdnQCsm@TAux)Rjs7ru6O3`t7kpa z6x!|aXGznY_TsIl-!eHxO@;oZy{rRxGqh_0d~fdz$B z^=AFbDh_As=X^vdY3k8Jv4CJwl-~qhR+mSVs5pJ*=I_mrQ}Hsfgjh04Ns7`3qpc2k z^G>U>G|P$l8B@nfEFdAzev@-50?WtB=c)ari^Ni5e;4_|Z2jA>s)`z>gk;7Yk>ZDA zsXBr(s3Pi|NptS%@NS`IXMx&R6bo1u!9HZ(v;&p@+d!;cXQqy_H{!2YNNjVD4FbpuC7S2^i0IGCU3tq{>gB)=R=Uet|888-FmwFEL|LKC? zU`SeVe25pUv1%i~MEgichl!$T%u;1~abp4bp7wC$g?>~-RmOqZP01*k!-tiuR|I&8m2Zw%4?{@pamglM5G7cW;0l;S*NW#PM%WM$afqAtC4nqlZisi~3k^7V0*&3F zR0K{5^avu*j@mq$EWA9g9OCZBNNElmf657LD*m1Gh^?rEX;_j2;GWqS4b@qMqS*yU zNn2~4(ZIhB0SX3<$Z5nPkZb_)>qLL$Ux?bVfbBd*hQ;>=w-o2+V2bE-!>tuSrs!GQ z01p`qipOh2o97kdtuyFzK~TK|w6N>>ieT4}=~Q$uffie62JPMXh-3>20CFlasUWc7 z_*STSm}sfO4uUTNAd%l%io5fwFw$YtnMb;?5lQXMpQrP z=7>zK4F?&9>LbhxvB1Yd^m@z3o)G=wSeJ zAY$}PNU8_J5Ey0w<5Y7dQScO6oL%1nR!;r3-a1x5@-Q?3kTGefI70~4Q9QdBlfBqm zB!4raoh@SDXGm}c?(r$XpP+srhQm93<^4cGH5%4AxI_MlYURSbD+XncD0#9w|Gc_~ z+2YPlar$K>`5Cd}+J2efgXR4ct96Z9aO^V?|I_;(24N%bHkSEXj61IK?IiK|m%;E( z;Mr!F?80f_#9yzu3n`g&`0jewGp)`Ug{ES73`O*GVGU+yuMr+Rdv*kLCmTvl0s@iR z;f8ST5V$KAAr1+{-3#L}@dQQ!fG#gFrOgtob4ZmV+w%h5L8ro>Z43Yr(+@$KFZoyU zKtq9J_cz`kF!LJ;kQn8>NYh`-G}%-a1Bd$}5|$7G?6itU0ztO$u%`wSZ3vlAg_jle zi9k<7xpB@eTZUF9evy>k2JSPVJ3Ne=dq%;`WD^uvFrV%^OzQTNEf7>Cz|jT(2pzSG XoFWVj000000000ewJ-f3009619w426 literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/index/part-0-ca83e9ef-60f3-4c8e-8ae5-b90998d2d0e4.idx/.index.crc b/v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/index/part-0-ca83e9ef-60f3-4c8e-8ae5-b90998d2d0e4.idx/.index.crc new file mode 100644 index 0000000000000000000000000000000000000000..cf0257b60e40a2d0e6ed431d6ed9407f854139db GIT binary patch literal 12 TcmYc;N@ieSU}9j;InoaR5cdNC literal 0 HcmV?d00001 diff --git 
a/v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/index/part-0-ca83e9ef-60f3-4c8e-8ae5-b90998d2d0e4.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/index/part-0-ca83e9ef-60f3-4c8e-8ae5-b90998d2d0e4.idx/.metadata.json.gz.crc new file mode 100644 index 0000000000000000000000000000000000000000..751197244d14948f4cf8ae64332476e45b29e216 GIT binary patch literal 12 TcmYc;N@ieSU}DJp6)OP%666CD literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/index/part-0-ca83e9ef-60f3-4c8e-8ae5-b90998d2d0e4.idx/index b/v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/index/part-0-ca83e9ef-60f3-4c8e-8ae5-b90998d2d0e4.idx/index new file mode 100644 index 0000000000000000000000000000000000000000..4ae215d83bafd98f365bcad69db96a246fa1968b GIT binary patch literal 78 zcmdO7U|^5|VvVi(e-&gN0$B`9jD~%4SeO_X9U0vj7!6-90~`(`_+jm0~LXuJbi6j{C!PzL2HZCtZc75M-%&(?(iji%8Aawg!7>6ShS z60Q{5iaP77Hp=6mll^6@=gX;3)}qBIHzBC>;35;BJqS?Fc~Zoa|BxYb2TXC?Fz$ul mvm(uf(Q_KSr&C==f;tQN_{~y0_Z~Q1ax6k4mT&75UQY zmf4Se5EJNfCC$2V6qi-T1eb4A2eOi?rX_)QRf7pk(+JLhP#zfYPFXlx)znZ1YE#u^ z;;RA~I|Gi4J<(+#+q%!pC~cu6R#R!1=8Yu`^M=1$f7hV3;>z7NFSMrHmz|^6EYc(v zbe)KWS`+L8N)82GHk89Te`qbWSgj{&P?t0sua+1@6>}EU7|_lniRgNz@T#0~yG?x@ zmDzdf4h!uovSUP-OFe~F>>Lcu*wLXKj$#5+{Y4^Ck9LNAmps{JZBtU~6u`a&e^dwq zHuSa8>IX1Os+RW?prNJ>X`tkl1|S=1Hu+TA@p<=})yVUuPIAg>`cD3WE!k^kc+Ovh zVy($AA_y?AnN~kTa$b_p-2ZRi7L(2R>Qm559OD}Fnf=SC^@hH~HJ6G|#>;+)8I=k< zi%aH{#OW*d8|DWeBoH3$#G7F;(Jx;Veo)`fx8nrohoengBD)N|!fVD2aRV-Cg}?l( zQUa)qX8yyIPMsf+`^H_Et>mls)OiNQta2cnqlb1D7^hr9evDC&+IRfk?pis?GbeN{)QMK>V1fSVafb){!si$9v9@>?D1+SAP7`eHJnuomNJ)jIb z>=1E)KwONz+`rn1|2M%y6q;*ZY6-!pW(Qcn0q3h6gM}d**MS& ztmX9XDbZToKHts~-du@ovjKTd?w^ZAB7DAGK3ycdhxsJsw04e>kGUzJMx{Q)t6|Dc z)OslKJ%Y~OK1RR3@`G*UW%EAF#jUzpFu&t zL$3Py-|f6I8Z{JC_DTD4#17^`H1U$uSE) literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/rows/.metadata.json.gz.crc 
b/v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/rows/.metadata.json.gz.crc new file mode 100644 index 0000000000000000000000000000000000000000..47a0a0400dcb0547284b519bd587bcca1099ce89 GIT binary patch literal 20 ccmYc;N@ieSU}AW-XjS*Wq@uQye^hP&0870I5dZ)H literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/rows/metadata.json.gz b/v03_pipeline/var/test/exports/GRCh37/SNV_INDEL/annotations.ht/rows/metadata.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..3a2611398ec70a55a019a1812c0d284e6e5fc6c8 GIT binary patch literal 1317 zcmV+=1={)_iwFP!000000Nq$^Z`(Ey{xABpVshu$iR1X2?Kq7ww=wJ>7#0`;EmAgH ziPA_qh!^O;@2D4wl;mV7&<$u26z1_>dB^9G^x`lS5ohGg879vJuO6<~3GsW5>v-SD z(HT8DVeB6StWg+}kO*f3TYij*uN0lpOX=bMm=HYG8=ys$2zGv*O^_gK$q&AK$=r=B zksmWjcS=SX;zu`$1(h^rAel4S@1<@L8w6v{(ebg++_+p&JSy!(%q78|I1UM7u95{b1gjRKbmKlDP7$-> zKf?iMkq77QPI^KjsmiO4Wjrkt8-LNiZbq}4Fh=V4K8{9ZYo_D*wf4++{h^n4_lR{(OcdvKacnP zMLtZ7P$}xM2$2v~%XG>x4*yTic*{hPSaFw}Mo5zJ+!KDxNo5!yxsM|6^Y06Fbt&7` z6{h3sYcJVzf-Bj2esf>fTg@J3i@If$@Tuu-?Spapw?t}BqhE|B%}Zjh+4Xsgc8-ytWY&k#W*s+buYhpaR2>~GJpts z_%pPfo9GmPLkV-Ugh7fSJL?j_-BuMIU%B$K9!pq>hXvZ!gH{{Q9EYP#=Ve$dR5dj&-}go1YR z#apbS4eNYUm{)XTu}!gCl1;J9xyD!|<-%r~LTC`pD?F2D*vc{|NXE3VDW(XVF-(nM zs@tilMc-1BSWI3i+!?fDbZ7XXMjI9S8kb&UZ*}=kBX13OE!;J!qQMenGfR?a=ZOC# z!Gae~P{Lb`&_W0;e9%S*ud+cK88o?|oeJJy0-FdLQ`U0U)e$SmHsQa4{08vXaNmM` z3-xW#w=my={1&ut0s9u5e;wvcl(#Xy4)1TGyMK_zA)k|2dhIuK=-pM!#Rg*ye@#Alqx{?A$2icp~zj_@rxjLX+DByn9 z4+Jnb)q9r3>##HWaj$Zz8wY_PQYV5Ki}MR_he*>+8+lii7PtbKyAfmII@#H%&CIfuFAN6%ayg%iM4+@8}r$50=DHG0j$ z>t)xAe*d7hm2#qxpK*5K(7y9&K0AZzbZjrFqnsy*%Nwe`evKX-wdiy^J{tDP6$v_n zZqVn!wF8|6ME4 zvkZKWxk*Kp3ht?SsU?}o#f-dp`6UX)1*yrIX_=`h41C}adA-P;(K%Q@gpobby+l9J zxkSp+z|`H&SvEMe$k@OrB%{dE(p1-7*ZK7}#zRaR?cUZ5md39au`r0obXZMmXB1+z zkQA_fYHVP|z|dl2Yt>fH%@lOB!|M36%a#lbj1i1S?Icf~wB=ay$6?-kb^CShiMD%6 zgl%?t^4Lu=J893vkYafxAVqPym{br$^MO-#Ku;*!GOXfe`nkD}he4Qufop@@mdz?n 
z-Ht^Y_%7%@Pru0ag4^I=@3%;{m)hK|=cP}uw1(9(3LnVObztLVfB;auD=+}#7yy+s Bo~r-= literal 0 HcmV?d00001 From c25dbc6cd07801df77ac474e26ede998c6317346 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 8 May 2025 11:15:49 -0400 Subject: [PATCH 55/55] feat: alphabetize nested field (#1096) * Support alphabetization of nested field * better sorting * Update misc_test.py * ruff --- v03_pipeline/lib/tasks/exports/misc.py | 6 ++ v03_pipeline/lib/tasks/exports/misc_test.py | 52 +++++++++++++++++- .../exports/write_new_transcripts_parquet.py | 3 +- .../write_new_transcripts_parquet_test.py | 32 ++++++++++- .../SNV_INDEL/annotations.ht/.README.txt.crc | Bin 12 -> 12 bytes .../annotations.ht/.metadata.json.gz.crc | Bin 20 -> 20 bytes .../SNV_INDEL/annotations.ht/README.txt | 2 +- .../.index.crc | Bin 0 -> 12 bytes .../.metadata.json.gz.crc | Bin .../index | Bin 0 -> 87 bytes .../metadata.json.gz | Bin .../.index.crc | Bin 12 -> 0 bytes .../index | Bin 87 -> 0 bytes .../SNV_INDEL/annotations.ht/metadata.json.gz | Bin 1326 -> 1322 bytes .../annotations.ht/rows/.metadata.json.gz.crc | Bin 24 -> 24 bytes .../annotations.ht/rows/metadata.json.gz | Bin 1817 -> 1829 bytes ...0-25098be4-300e-4383-8199-08163e61c02b.crc | Bin 0 -> 16 bytes ...0-d277dff1-643a-41df-b9e6-a4dbf5cf99cc.crc | Bin 16 -> 0 bytes ...art-0-25098be4-300e-4383-8199-08163e61c02b | Bin 0 -> 971 bytes ...art-0-d277dff1-643a-41df-b9e6-a4dbf5cf99cc | Bin 939 -> 0 bytes 20 files changed, 90 insertions(+), 5 deletions(-) create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-25098be4-300e-4383-8199-08163e61c02b.idx/.index.crc rename v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/{part-0-d277dff1-643a-41df-b9e6-a4dbf5cf99cc.idx => part-0-25098be4-300e-4383-8199-08163e61c02b.idx}/.metadata.json.gz.crc (100%) create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-25098be4-300e-4383-8199-08163e61c02b.idx/index rename 
v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/{part-0-d277dff1-643a-41df-b9e6-a4dbf5cf99cc.idx => part-0-25098be4-300e-4383-8199-08163e61c02b.idx}/metadata.json.gz (100%) delete mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-d277dff1-643a-41df-b9e6-a4dbf5cf99cc.idx/.index.crc delete mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-d277dff1-643a-41df-b9e6-a4dbf5cf99cc.idx/index create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/.part-0-25098be4-300e-4383-8199-08163e61c02b.crc delete mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/.part-0-d277dff1-643a-41df-b9e6-a4dbf5cf99cc.crc create mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/part-0-25098be4-300e-4383-8199-08163e61c02b delete mode 100644 v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/part-0-d277dff1-643a-41df-b9e6-a4dbf5cf99cc diff --git a/v03_pipeline/lib/tasks/exports/misc.py b/v03_pipeline/lib/tasks/exports/misc.py index a79959866..08f9c570e 100644 --- a/v03_pipeline/lib/tasks/exports/misc.py +++ b/v03_pipeline/lib/tasks/exports/misc.py @@ -27,6 +27,12 @@ def camelcase_hl_struct(s: hl.StructExpression) -> hl.StructExpression: return s.rename({f: snake_to_camelcase(f) for f in s}) +def sorted_hl_struct(s: hl.StructExpression) -> hl.StructExpression: + if not isinstance(s, hl.StructExpression): + return s + return s.select(**{k: sorted_hl_struct(s[k]) for k in sorted(s)}) + + def array_structexpression_fields(ht: hl.Table): return [ field diff --git a/v03_pipeline/lib/tasks/exports/misc_test.py b/v03_pipeline/lib/tasks/exports/misc_test.py index c0b8eca7e..d04fca759 100644 --- a/v03_pipeline/lib/tasks/exports/misc_test.py +++ b/v03_pipeline/lib/tasks/exports/misc_test.py @@ -8,6 +8,7 @@ ) from v03_pipeline.lib.tasks.exports.misc import ( 
camelcase_array_structexpression_fields, + sorted_hl_struct, unmap_formatting_annotation_enums, unmap_reference_dataset_annotation_enums, ) @@ -86,7 +87,25 @@ def test_unmap_formatting_annotation_enums(self) -> None: existing_inframe_oorfs=None, existing_outofframe_oorfs=None, existing_uorfs=None, - fiveutr_annotation=None, + fiveutr_annotation=hl.Struct( + type='OutOfFrame_oORF', + KozakContext='CGCATGC', + KozakStrength='Weak', + DistanceToCDS=41, + CapDistanceToStart=None, + DistanceToStop=None, + Evidence=None, + AltStop=None, + AltStopDistanceToCDS=None, + FrameWithCDS=None, + StartDistanceToCDS=None, + newSTOPDistanceToCDS=None, + alt_type=None, + alt_type_length=None, + ref_StartDistanceToCDS=None, + ref_type=None, + ref_type_length=None, + ), fiveutr_consequence=None, ), biotype='protein_coding', @@ -520,7 +539,25 @@ def test_camelcase_array_structexpression_fields(self) -> None: existingInframeOorfs=None, existingOutofframeOorfs=None, existingUorfs=None, - fiveutrAnnotation=None, + fiveutrAnnotation=hl.Struct( + type='OutOfFrame_oORF', + KozakContext='CGCATGC', + KozakStrength='Weak', + DistanceToCDS=41, + CapDistanceToStart=None, + DistanceToStop=None, + Evidence=None, + AltStop=None, + AltStopDistanceToCDS=None, + FrameWithCDS=None, + StartDistanceToCDS=None, + newSTOPDistanceToCDS=None, + alt_type=None, + alt_type_length=None, + ref_StartDistanceToCDS=None, + ref_type=None, + ref_type_length=None, + ), fiveutrConsequence=None, ), biotype='protein_coding', @@ -542,3 +579,14 @@ def test_camelcase_array_structexpression_fields(self) -> None: ], ), ) + + def test_sorted_hl_struct(self) -> None: + struct = hl.Struct( + z=5, + y=hl.Struct(b=2, a=hl.Struct(d=4, c=3)), + x=hl.Struct(k=9), + ) + self.assertEqual( + sorted_hl_struct(struct), + hl.Struct(x=hl.Struct(k=9), y=hl.Struct(a=hl.Struct(c=3, d=4), b=2), z=5), + ) diff --git a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py 
index d3f96c518..2076f0437 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py +++ b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py @@ -12,6 +12,7 @@ from v03_pipeline.lib.tasks.base.base_write_parquet import BaseWriteParquetTask from v03_pipeline.lib.tasks.exports.misc import ( camelcase_array_structexpression_fields, + sorted_hl_struct, transcripts_field_name, unmap_formatting_annotation_enums, ) @@ -69,5 +70,5 @@ def create_table(self) -> None: transcriptRank=i, ), ) - .map(lambda s: s.select(**{k: s[k] for k in sorted(s)})), + .map(sorted_hl_struct), ) diff --git a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py index bec8bbcf6..96431ee38 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py +++ b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py @@ -135,7 +135,25 @@ def test_write_new_transcripts_parquet_test( 'existingInframeOorfs': None, 'existingOutofframeOorfs': None, 'existingUorfs': None, - 'fiveutrAnnotation': None, + 'fiveutrAnnotation': { + 'AltStop': None, + 'AltStopDistanceToCDS': None, + 'CapDistanceToStart': None, + 'DistanceToCDS': 41, + 'DistanceToStop': None, + 'Evidence': None, + 'FrameWithCDS': None, + 'KozakContext': 'CGCATGC', + 'KozakStrength': 'Weak', + 'StartDistanceToCDS': None, + 'alt_type': None, + 'alt_type_length': None, + 'newSTOPDistanceToCDS': None, + 'ref_StartDistanceToCDS': None, + 'ref_type': None, + 'ref_type_length': None, + 'type': 'OutOfFrame_oORF', + }, 'fiveutrConsequence': None, }, }, @@ -144,6 +162,18 @@ def test_write_new_transcripts_parquet_test( list(export_json[0]['transcripts'][0].keys()), sorted(export_json[0]['transcripts'][0].keys()), ) + self.assertEqual( + list( + export_json[0]['transcripts'][0]['utrannotator'][ + 'fiveutrAnnotation' + ].keys(), + ), + sorted( + export_json[0]['transcripts'][0]['utrannotator'][ + 
'fiveutrAnnotation' + ].keys(), + ), + ) def test_grch37_write_new_transcripts_parquet_test( self, diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/.README.txt.crc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/.README.txt.crc index 929848c8f70020fdfb19c695a3d4b60115577883..6660c151995d73af160ebf4d7c0b7d98b456a53d 100644 GIT binary patch literal 12 TcmYc;N@ieSU}E_BV44H~6-WcZ literal 12 TcmYc;N@ieSU}D%J7R3wz5lI4K diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/.metadata.json.gz.crc index 04084645f54cd62cdbaa7618a71d2e940992ecf6..16644fe9d5c8e393f4d284049388f82d85f14073 100644 GIT binary patch literal 20 ccmYc;N@ieSU}Ctx$Yw3S)}5REr|RYc077L52LJ#7 literal 20 bcmYc;N@ieSU}8v8&-985s5^dNbXfraHNOUQ diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/README.txt b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/README.txt index 9d34ddd04..ace5a0d1d 100644 --- a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/README.txt +++ b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. 
Written with version 0.2.133-4c60fddb171a - Created at 2025/04/24 12:35:20 \ No newline at end of file + Created at 2025/05/06 18:22:47 \ No newline at end of file diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-25098be4-300e-4383-8199-08163e61c02b.idx/.index.crc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-25098be4-300e-4383-8199-08163e61c02b.idx/.index.crc new file mode 100644 index 0000000000000000000000000000000000000000..481fe6ff8876565470ec2dcd58831b50eeceabae GIT binary patch literal 12 TcmYc;N@ieSU}88a^l35x6Hx=3 literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-d277dff1-643a-41df-b9e6-a4dbf5cf99cc.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-25098be4-300e-4383-8199-08163e61c02b.idx/.metadata.json.gz.crc similarity index 100% rename from v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-d277dff1-643a-41df-b9e6-a4dbf5cf99cc.idx/.metadata.json.gz.crc rename to v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-25098be4-300e-4383-8199-08163e61c02b.idx/.metadata.json.gz.crc diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-25098be4-300e-4383-8199-08163e61c02b.idx/index b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-25098be4-300e-4383-8199-08163e61c02b.idx/index new file mode 100644 index 0000000000000000000000000000000000000000..77676e354bf0e52c11e98ce10a4edb5614e97bcd GIT binary patch literal 87 zcmdOAU|>)JVvVi(e-)Gj8G%eDmgI~g!;dR0nHU(I8ABMr+)rSxJEP+YUIm~UVVD-- MnG6h!KwW5h000CL{{R30 literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-d277dff1-643a-41df-b9e6-a4dbf5cf99cc.idx/metadata.json.gz 
b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-25098be4-300e-4383-8199-08163e61c02b.idx/metadata.json.gz similarity index 100% rename from v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-d277dff1-643a-41df-b9e6-a4dbf5cf99cc.idx/metadata.json.gz rename to v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-25098be4-300e-4383-8199-08163e61c02b.idx/metadata.json.gz diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-d277dff1-643a-41df-b9e6-a4dbf5cf99cc.idx/.index.crc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/index/part-0-d277dff1-643a-41df-b9e6-a4dbf5cf99cc.idx/.index.crc deleted file mode 100644 index 3704b42b8edd6ae577ebfa629a5a2f055a90eb14..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12 TcmYc;N@ieSU}89XB)JVvVi(e-)Gj8G%eDmgI~g!;dR0nHU(I8ABMr+)rSxJELOX9u3aSc!ABzY800000006BQ+fL&+^k4e4BV|VeMKJ1Hr_ccbOGPVoC4?-K zoFpC`J8L^kJE;G@$4;C$X}M^$5-Rbz-+lZtN*M?83`#R8Mw8={@!{8KlvBn#;ONWe z!_Nof@t;TW@s~779uH0q=qMT~NI`*!K@$a>l+@%%+(D6l#L!0+Z#E?$W@QgA8JCZg zPnJeiv3Y5fXn)qR0uc7ZL>48FLZ_ua#hi&}s{An%vY-ipwJbmnfIz`C0=kE&N;A^bhsTVb&^6jZ~XAr9Cpao7m`sIxx5|0&9hp=6{nNahVzbF9J^l8y)F} zCRA3rkjA>`sZ;h02!A7)TVyN~YOIl9?tMs;tN|QO?2v*tngzS)?e2Q=;wI;ul zq*kJ9#DWP)Xv~s!KExDTiD77J>oIRhGw$3>0Re;sD=x~amMP5>Wc+I(_PENnDnI^Wj1V+SS^siFmc5Ghrvn^?3+Ss9tWfv*DKo<{7_=aG#a zfj4eiG@-qezv$CU3WF~8VE7FpGr0$zqJD}BR&sW_Y*fr>*@2gaD$^iBs4N@c*)vPr ztY{=oxpCP3v9G7W&22pXW+o3{PTyW?0Y8_ko5fxn<%Wd*`!nG^Z?PK5>k2G36ync+ zeIYmnAEKr*f?{5m(&g{?D9Q{$>873J(^(U$>0kb1?khXWW#M~eHoZ7)=SL3l6V_qc zR`&;Qrru!sjppAYzjYEFfB&NG8SuWozNApqM^=(wM8{G=Hz&e{g2rf%ZaSpiByq!N zgYn8P6y&DMm(SpNoh?lAbWRpGi|grsKjh|?e80N+Hoa>3=7n<;8O+geZ9}aI+qWpT@1Wi3 zvfl!?OOm>gw_AL##J->UsJJ#)guok|^HV^;+OI(O6 delta 1317 zcmV+=1={+m3a$!&ABzY800000006BQYfs}i@W1rat(03W55cP6x&;;l92Kp&lMu2@ zGD*BRcFuO#c2WQPjh#4g((usgBvdl*muEbFnWT&Zc?PAK6qD)E@xlJrXp&RLJLKfc 
z=l#!n2M2#1#z$Y$BzfFB-lLOfq96qYA_h$qa8go}BXfs;MHWLJQN7uefRvR5UNSBp zDW5Kls$%of7||Zqu>uelVj_!@N1@ZwLosLKnJN!vLKZY3u$Ber0a8doVwoTY{|Rjr zWx{wKU2+x!q0A#-O=JIUWLZGsp*$;+f$9uIXb@P|`n!!lRK-Xv7E+L5pe`5FnNoCf zUkg2KgD@?B<8@`ILDm)31{5h3x-$?6GGQgC)}X9NpD(f%k?0 znyZ&W7i*2Myzp;j%RjW|lUyt4(pHZZv_0?!I7&Htu5;xaV=UIjt~8y)$E zE>zaIkj94Sxl{HGh<~G)TNEr4YOIrB?tMy=tN|Q<(+gn^_O}mFfjD>Y2epq1x+YJs zAq{7Q-_v_8<4Q+p5jg0FNXegFut=F++223D-_G-cub({MK(WdWj>!)D@uvu9QtR?N zMQSy=Ml6_+gvKms=R-`fl^CX`_8#k&G~>?A6c9jIFm9f890Kj^3(R-y1x0C%fyJhW zDtV=UiJcM7ZbNB~krnrz^iL46Uf1C`h6ij}*|ruF39K7QgXe|&YT#T@A7I040j-hI zI4Cc3OdU=!^vG-K53e&WsngRx*h~|9wgt`bZ~9G24pcr_kFI(edo{2&fQ~1m z4u-{~nP<(=##zfK&cDXMc-z35X=4W_6RDzq1rRA!s+(N2L|GY`zCo`7Nf2VpUoTnv%mbunLpG~E(_l#^V!8oJ34ZR zpYRB#ZAL$MDfMR3Z}k0${x(Pq{QWz&=fL~+`kF#hAK6BJ5gkhf-G~Tx2^yn6x@nGj ziNxiiO~xy0P*9t$UcQ3ob+xd_voo@PxLI7!{vkKF*j?|?q z$2k};A1=6Ul{(o#&791VA8mP$hiku+1e;>_qk#pROJYX0S%XjSa0PY-~qt zpFq3QW!wX|Mv}Ubw_ALw#;N66;*W0_3xo26H5W5F(`|9=l zTg1wrAD45?C{*Hge;a&s)LQw|Y literal 24 fcmYc;N@ieSU}D&;$`TZFR!#rKw?B3!LjFhtd=K4|CvWt_i?d)|z!=9diAmwz z!c>rjBoK<;=%ul+c*;pQQ4^p=ga~HdPfu_`=A5kq{!*D68GoW6OF3N%8BH;J^7E{q zoMtIVE>b?|m%j83f-z(G>4((3_1SXp3DqX|v&ae>j+*HrW1NJDGn^IyqdA8J(}MiF zCTRd(mDAP%PH37T9MG^(Rsx)6DGhKus-5_-N?=CKFG)&}=5@7vC<0@YtC1Fp_C$ig zLIEoga8s7Y>wf|TF_rG+)E&(@>_BCqiAbe|IAIB@;+N@FDz`p)gmuz$rJ^(%3q0d0 z!P2KGXRp_0Bm(cqog0qx6|8De5D3w*m}9=mVDV@``O|267H7CJ7iSSC#4IIRKs7`t z#Y^}LR2Jf;C*WLcMpW{*5_0)GjGsLqrIsMbZbGJ3#_;*={- zrN0*GrI0tz*vK?R3r^D|n2H!28I)x#Dw>s2v)P(w(Kdy(E{*5|0grmUYTMn+%78lA ziRx$%$=B=|-%sG=K+={J^hvtpt5$pp(O~el$R?-r=2-a?oVOEmju~%4(ZyNbNT&}p z6fvtR7=OpSo;Sy)w`TH8tkPeUuXJJM)vceB|K^LUnR^n)998RK3s79A-pHYf(H19S z+OX3IEIVac-ukk9^mFhCy{cNCqoKd#2wZThU5+Qs@{PZ%|DCCW;jgkp9l6Q);BuQL;}cBABlk+1Uc$)_+|bj@sm5MPxil#%c8GP+sHMv}pKT zrdfhR1l?O)Y~qXwgK&=F(3?yDjEW#*L~BB;WfGb?IDjY%TD3*F?lCTkcAd(`A#~b= zm=~A6kJ|349Y2?Q3&k-$L$j;dxAEWT>Kc8yygC_Qw$GlytZwU^B-C~fE%nX*tCD-G ze}8EHgjlUe_ET9=-4-3*rA3q*vCbGxFel>+Yg7CPE{r5W-@aZp`)Mgm#;2z!UojH4 
zviZf?!q&T<-b^oT;Vg@v@)b$@==^4UZO`6J=Zm2&{5G3(L~A|+{|<50rx&;omsKuQ zw8DW_n24a=-N3nb+K!zQt}Ky999TOqw|`SJq8pr`arm<>2-F&@Mp|8nIknpmiS}S~ z)gnv{=OKwXw%0B$L|zo*DgR27Q|p|P5slmnxSj0T?nl#+VCtNZ`s*E zXF1!DvmDmotPS#JV4|fI>eQ@@OlX!f+O4+bQmfO^f!Uy69ezzx-lgV5WEq-C$WA!6 z+1N?N8=zRv$wJ~iVc5;W+c9_-1%Epscq;>ML|`3$<d1vwx@!w>q)4 zB}NT86fd^$v6)saSTtDCjjDV2+(4+W@G0?h1DKj^Qex=_BHaa~4jy&jsEbCo!l;Wy zyC~`=(H$Uaa%ktkI`*aA4nd4MeRjZ8<4y}Z4eB(Y(_~HqIXlGZfX(f!X`rTqGCPoI z^W{#wG){kq<#epK=L@8+cs zbX5?p0@?J15H4@m%R`%qmWNTxBL02fG*ncbn5<}illp^0o8tI8l+D}fsq)9FFDZ@+ zstQ&BJ|MKY+1)yDJY$nRLYuxM}m znozAlyE>+o(i6sWcz@wIqvEH=(GO1y4tfXukG;>wcVu`l==I3KaBw_0I6nOR`Ji`v zcr+kKhe5A@SN$^i^BSj1aspRoD4u`u3i19T603j5g#Xkf3|@m3JA`nW681)iy* Q6W+@5fA^7LYpNXp0F*9{r2qf` delta 1814 zcmV+x2kH2w4w(*+7=PyCCavEb$7zD5jS(B!#TEg>+DeL-@946n<@Lm63Dg}yREG49AP zSSVp9B4NvlbbnQ%D5c81TDqr9Km}?GZA3aPBnZz?9luJiR=EqwBkYrzE0wL$S`bX= z1gntdg1=mwlLWjYe{MJ}mawa3NgzbaVu8hy!RFD3il^b^JY~2xm$F07yS8X4Xubej35RC}F~*a!-73AxR)Qs)6jr7*p$`OHnhk5zyIoZgQKwe2 zJ9Gg)v!D3K7#$p89<(o7??J9w zR$75=rK+mCP<4&j{*N%CYL(+bxDW^&2xm%;#;vQ-SN*?p{XB$Amgy&LJi0vV=F*Zx zKcHa~X-{pnnt0H~t5kOC6G{jcD^9jfzSj*EkAI^sdDsyZPsRjIUL49x9NQKxpNpJj zI7TqE<#{HbENKuPUwGE-t^bM2h;d@fp5-EoZ5i~)<4XA zLVuk3s>Z15sQzS*@AEP#tXQX}EXrBjqJfpa3u(3j6wZ9N@@@#yRf6-!RyPByzd zpSyZDliSIaE1a_QsaTTy5MA7kZrs(|$!tDwgfAIDyLz@G=DfoqxgH*5U4X&Um1NN*L1ZZ66419ghiNI z&SR1a?Cyn9^&V)D0EPawj7+4&DQgu<7=kftX*LvjX%4Oge91#~YA^=vrp?>69AeGC zRXs482w|?UG?wRtKBLRkeMSWo)|g_7Um=9JCe!{B3=hKPl5ec7iKPkw+EEkB@qeI_ z5dNSb+`{iG`R^Z}ZpO=_4{tPGH!eUCHpD_}L;d7UgU*fCHgeV$U1nCdcgSoJvs!I| zSzR`~Y+MxWg-w)7Xu`57vRPTJ7^m8XL#;{2s-W?mD#V!mje* zWnB;IDs>NsZU9|}b9H0cn7fg39mrKCT*mbxuF|au*8$w_Fwgl(O{{H4ho;&c znbw*~uXyhLDBDE!B9gTN*zT`Ku-er|fOXK-Cf6Rbwwu}_tCv_iSTuM;qkm$@9qU2W z;Y14){gk?c&oyxR0-uUZ*D$F)Ar+CXfzfR&>OoNtiTXfvBZvAhv`?Xa2;IVKD*wAq7A4`*)1Op7u-fZ5|q7cIAvrS=(k3mOj zETwxXoY9}4aQy!L(Q*H_e@YS8x9)hKJhyS?-dV{%kVK7NP7ar?tp4Q9Nh<3F@^|n4 ztktCyPM4(qu>GR1-R3JpLU}3Az{eKJ<-T^Gv8~q&SQ-AhFXdGTV1HE-p#w3<$Z`RP 
zx9`27%f!gT>SY=Kt}huns$NV^G`~sz!9$zl^eeQ@>-wqo$Lo(Mo(`%TRPGIuJ{+M=bL%ji3>-7_N({_{Iy-|IstV!f|>|NNjSAIyS_n_My+e#Hk z;|08KoKyMZ;_zD_FMsy-_u|8olQ>BZ_KpWfcyDkJCwuoF$?+Z@#P`YjDEas?it67Y zzu(||K~CYwjOG2$K`Gx}WMa)Pm++sag~4mI?$NHL*eRM9U`9jg(X}9Sl1@d@v}1nz?M01y%PpmpGO|zq3c)Qyx#=j38J>KZ0*L^8 z0GI%{af4ac$G7EmSRHO>O{g?$WNDV82{Rg=?ZmnHCfLyYa%fNtvY^~S^aQ|3PF)^+ ztnNsm#Gg$_pqk$SY zAuLv04q~zcgs0XjTW?>yV(U%&{;k?*Ub; zHmo-+i5|RI zRpM>I9(*T`k7x6;0)f;0&fRIQdOjXb!%YT)Q@Wcl`AB$q7Z3TTN)#_pKs0M87-4$4 z70X9TOo%yZ1eSXQ8VVz?BQYOdE>mK>WWeOc6Db}E#2psuC;~fRVu!lBix@aC7zGTG z;txnWRKylMV7QMCObN!Bg&95~Qe#DDbe%LDbYpYoa2jqSr};xMi&w<8&bZU?VvO%B zPEWBFPd+f6Bv5*OxJa}s*hq{S9_&`7?W2d3Qn;kV%lP6#jt3y^T-g~$;BXIPU@)SK zC@M!#!_yP8Kq4f&=!A%k1}#24WXo(>Q&pwxfRe~lpSr0Q@@3{08coHh$d_3&iTk-s z+;2o_l(d;PYeVH7DxQ+m>90ERhVofzdd3M%}FI=#!b;E>X8hLMTjj8`# zuE5!KrLrQB6J&hu#tN=+1sF&Z&OVof+T=c&u%%+XhflWt0000004TLD{U87V006y^y?+1z literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/part-0-d277dff1-643a-41df-b9e6-a4dbf5cf99cc b/v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht/rows/parts/part-0-d277dff1-643a-41df-b9e6-a4dbf5cf99cc deleted file mode 100644 index d5237ae1e9df036cb6c40d3d1560c5485433e88b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 939 zcmV;c162H$0{{TT3IG5owJ-f(#0f`*vOVC zegJj=k^uFds7ktLaogK7Ub&UC%&(Xuxu|!w8LQYKmVXjmi9G6Qixq9&nqyll&Zf39 z0Xl1{#=p}h`p7HVoB5dQjjcu-a4Tm$S1aa7EpA)4-SuXx(OMbrk=tt8YBUp{8xZC* z1D&UC!*>cNE zYOi;!cubkBi1W@o>ua~&WeN_9&*J9!YKya}rB)(#v{=Lz-;A%-dk4E9-T@bc2EX&? z*Xwa{J;?2VC+{>}zF#lLvr`lElyV$5LIWF(e zET9rV;vo3&rx!UvhDg{6;z>R|r!L2{b9$=n*mcJNPggVyI>OixjU7iS-elr8I0>W< z49}I~DNyuCjW#B@kQ)*J{Rk>3gzH2Y@jueUWV!|(tq+i6bTnp|(L#UE@gf8PqVP!e zGGA9!TdZ4>GChm3FiS;Nnp(w1Su`zKX;v-E0bynic%n8c!Kj40#Fmj&px;PHvCA=}Ud<4nj@mfHzM86>-Y-AP{bYTe}SbOpst5`s188fLVjC zR*Zp$A7t0$Kg{tp1`r!ew6%Z2#0L9hT(gNnn65Hl7FN4d@^_wyv`hYkKMeo?00000 N04TLD{U87V008%0x4-}Z