Skip to content

Commit f5a92f9

Browse files
authored
feat: refactor transcript export to exclude MITO/GCNV/SV (#1115)
* feat: refactor transcript export to exclude MITO/GCNV/SV
* ruff
* bugfix: mito annotations incorporate variant fields
* reorder gcnvs
1 parent 599faa7 commit f5a92f9

File tree

9 files changed

+129
-347
lines changed

9 files changed

+129
-347
lines changed

v03_pipeline/lib/misc/clickhouse.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def src_path_fn(self) -> Callable:
5555
ClickHouseTable.ENTRIES: new_entries_parquet_path,
5656
}[self]
5757

58-
def should_load(
58+
def should_load( # noqa: PLR0911
5959
self,
6060
reference_genome: ReferenceGenome,
6161
dataset_type: DatasetType,
@@ -73,11 +73,12 @@ def should_load(
7373
dataset_type,
7474
)
7575
)
76+
if self == ClickHouseTable.TRANSCRIPTS:
77+
return dataset_type.should_write_new_transcripts
7678
return self in {
7779
ClickHouseTable.ANNOTATIONS_DISK,
7880
ClickHouseTable.ANNOTATIONS_MEMORY,
7981
ClickHouseTable.KEY_LOOKUP,
80-
ClickHouseTable.TRANSCRIPTS,
8182
}
8283
msg = f'Unhandled ClickHouseMigrationType: {migration_type.value}'
8384
raise ValueError(
@@ -91,11 +92,12 @@ def should_load(
9192
dataset_type,
9293
)
9394
)
95+
if self == ClickHouseTable.TRANSCRIPTS:
96+
return dataset_type.should_write_new_transcripts
9497
return self in {
9598
ClickHouseTable.ANNOTATIONS_DISK,
9699
ClickHouseTable.ANNOTATIONS_MEMORY,
97100
ClickHouseTable.KEY_LOOKUP,
98-
ClickHouseTable.TRANSCRIPTS,
99101
ClickHouseTable.ENTRIES,
100102
}
101103

v03_pipeline/lib/model/dataset_type.py

Lines changed: 3 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
from collections import OrderedDict
21
from collections.abc import Callable
32
from enum import StrEnum
43

@@ -406,36 +405,9 @@ def export_vcf_annotation_fns(self) -> list[Callable[..., hl.Expression]]:
406405
],
407406
}[self]
408407

409-
def export_parquet_filterable_transcripts_fields(
410-
self,
411-
reference_genome: ReferenceGenome,
412-
) -> OrderedDict[str, str]:
413-
fields = ['geneId']
414-
if self in {DatasetType.SV, DatasetType.GCNV}:
415-
fields = [
416-
*fields,
417-
'majorConsequence',
418-
]
419-
if self in {DatasetType.SNV_INDEL, DatasetType.MITO}:
420-
fields = [
421-
*fields,
422-
'canonical',
423-
'consequenceTerms',
424-
]
425-
fields = {
426-
# above fields are renamed to themselves
427-
k: k
428-
for k in fields
429-
}
430-
if self == DatasetType.SNV_INDEL and reference_genome == ReferenceGenome.GRCh38:
431-
fields = {
432-
**fields,
433-
'alphamissensePathogenicity': 'alphamissense.pathogenicity',
434-
'extendedIntronicSpliceRegionVariant': 'spliceregion.extended_intronic_splice_region_variant',
435-
'fiveutrConsequence': 'utrannotator.fiveutrConsequence',
436-
}
437-
# Parquet export expects all fields sorted alphabetically
438-
return OrderedDict(sorted(fields.items()))
408+
@property
409+
def should_write_new_transcripts(self):
410+
return self == DatasetType.SNV_INDEL
439411

440412
@property
441413
def overwrite_male_non_par_calls(self) -> None:

v03_pipeline/lib/tasks/exports/fields.py

Lines changed: 30 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
11
import hail as hl
22

33
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
4-
from v03_pipeline.lib.tasks.exports.misc import (
5-
transcripts_field_name,
6-
)
4+
from v03_pipeline.lib.tasks.exports.misc import reformat_transcripts_for_export
75

86

97
def reference_independent_contig(locus: hl.LocusExpression):
@@ -256,11 +254,11 @@ def get_populations_export_fields(ht: hl.Table, dataset_type: DatasetType):
256254
},
257255
DatasetType.GCNV: lambda ht: {
258256
'seqrPop': hl.Struct(
259-
af=ht.gt_stats.AF,
260257
ac=ht.gt_stats.AC,
258+
af=ht.gt_stats.AF,
261259
an=ht.gt_stats.AN,
262-
Hom=ht.gt_stats.Hom,
263-
Het=ht.gt_stats.Het,
260+
het=ht.gt_stats.Het,
261+
hom=ht.gt_stats.Hom,
264262
),
265263
},
266264
}[dataset_type](ht)
@@ -313,17 +311,32 @@ def get_consequences_fields(
313311
reference_genome: ReferenceGenome,
314312
dataset_type: DatasetType,
315313
):
316-
consequences_field = transcripts_field_name(reference_genome, dataset_type)
317-
if (
318-
reference_genome == ReferenceGenome.GRCh38
319-
and dataset_type == DatasetType.SNV_INDEL
320-
):
321-
return {
322-
'sortedMotifFeatureConsequences': ht.sortedMotifFeatureConsequences,
323-
'sortedRegulatoryFeatureConsequences': ht.sortedRegulatoryFeatureConsequences,
324-
consequences_field: ht[consequences_field],
325-
}
326-
return {consequences_field: ht[consequences_field]}
314+
return {
315+
DatasetType.SNV_INDEL: lambda ht: {
316+
**(
317+
{
318+
'sortedMotifFeatureConsequences': ht.sortedMotifFeatureConsequences,
319+
'sortedRegulatoryFeatureConsequences': ht.sortedRegulatoryFeatureConsequences,
320+
}
321+
if reference_genome == ReferenceGenome.GRCh38
322+
else {}
323+
),
324+
'sortedTranscriptConsequences': ht.sortedTranscriptConsequences,
325+
},
326+
DatasetType.MITO: lambda ht: {
327+
# MITO transcripts are not exported to their own table,
328+
# but the structure should be preserved here.
329+
'sortedTranscriptConsequences': hl.enumerate(
330+
ht.sortedTranscriptConsequences,
331+
).starmap(reformat_transcripts_for_export),
332+
},
333+
DatasetType.SV: lambda ht: {
334+
'sortedGeneConsequences': ht.sortedGeneConsequences,
335+
},
336+
DatasetType.GCNV: lambda ht: {
337+
'sortedGeneConsequences': ht.sortedGeneConsequences,
338+
},
339+
}[dataset_type](ht)
327340

328341

329342
def get_variants_export_fields(

v03_pipeline/lib/tasks/exports/misc.py

Lines changed: 54 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from collections import OrderedDict
2+
13
import hail as hl
24

35
from v03_pipeline.lib.annotations.enums import (
@@ -44,40 +46,67 @@ def array_structexpression_fields(ht: hl.Table):
4446
]
4547

4648

47-
def transcripts_field_name(
49+
def reformat_transcripts_for_export(i: int, s: hl.StructExpression):
50+
formatted_s = (
51+
s.annotate(
52+
majorConsequence=s.consequenceTerms.first(),
53+
transcriptRank=i,
54+
)
55+
if hasattr(s, 'loftee')
56+
else s.annotate(
57+
loftee=hl.Struct(
58+
isLofNagnag=s.isLofNagnag,
59+
lofFilters=s.lofFilters,
60+
),
61+
majorConsequence=s.consequenceTerms.first(),
62+
transcriptRank=i,
63+
).drop('isLofNagnag', 'lofFilters')
64+
)
65+
return sorted_hl_struct(formatted_s)
66+
67+
68+
def export_parquet_filterable_transcripts_fields(
4869
reference_genome: ReferenceGenome,
49-
dataset_type: DatasetType,
50-
) -> str:
51-
formatting_annotation_names = {
52-
fa.__name__ for fa in dataset_type.formatting_annotation_fns(reference_genome)
70+
) -> OrderedDict[str, str]:
71+
fields = {
72+
k: k
73+
for k in [
74+
'canonical',
75+
'consequenceTerms',
76+
'geneId',
77+
]
5378
}
54-
if 'sorted_gene_consequences' in formatting_annotation_names:
55-
return snake_to_camelcase('sorted_gene_consequences')
56-
return snake_to_camelcase('sorted_transcript_consequences')
79+
if reference_genome == ReferenceGenome.GRCh38:
80+
fields = {
81+
**fields,
82+
'alphamissensePathogenicity': 'alphamissense.pathogenicity',
83+
'extendedIntronicSpliceRegionVariant': 'spliceregion.extended_intronic_splice_region_variant',
84+
'fiveutrConsequence': 'utrannotator.fiveutrConsequence',
85+
}
86+
# Parquet export expects all fields sorted alphabetically
87+
return OrderedDict(sorted(fields.items()))
5788

5889

59-
def subset_filterable_transcripts_fields(
90+
def subset_sorted_transcript_consequences_fields(
6091
ht: hl.Table,
6192
reference_genome: ReferenceGenome,
62-
dataset_type: DatasetType,
6393
) -> hl.Table:
64-
field_name = transcripts_field_name(reference_genome, dataset_type)
6594
return ht.annotate(
66-
**{
67-
field_name: hl.enumerate(ht[field_name]).starmap(
68-
lambda idx, c: c.select(
69-
**{
70-
new_nested_field_name: parse_nested_field(
71-
ht[field_name],
72-
existing_nested_field_name,
73-
)[idx]
74-
for new_nested_field_name, existing_nested_field_name in dataset_type.export_parquet_filterable_transcripts_fields(
75-
reference_genome,
76-
).items()
77-
},
78-
),
95+
sortedTranscriptConsequences=hl.enumerate(
96+
ht.sortedTranscriptConsequences,
97+
).starmap(
98+
lambda idx, c: c.select(
99+
**{
100+
new_field_name: parse_nested_field(
101+
ht.sortedTranscriptConsequences,
102+
existing_field_name,
103+
)[idx]
104+
for new_field_name, existing_field_name in export_parquet_filterable_transcripts_fields(
105+
reference_genome,
106+
).items()
107+
},
79108
),
80-
},
109+
),
81110
)
82111

83112

v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py

Lines changed: 9 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -2,30 +2,23 @@
22
import luigi
33
import luigi.util
44

5-
from v03_pipeline.lib.misc.callsets import get_callset_ht
65
from v03_pipeline.lib.paths import (
76
new_transcripts_parquet_path,
87
new_variants_table_path,
9-
variant_annotations_table_path,
108
)
119
from v03_pipeline.lib.tasks.base.base_loading_run_params import (
1210
BaseLoadingRunParams,
1311
)
1412
from v03_pipeline.lib.tasks.base.base_write_parquet import BaseWriteParquetTask
1513
from v03_pipeline.lib.tasks.exports.misc import (
1614
camelcase_array_structexpression_fields,
17-
sorted_hl_struct,
18-
transcripts_field_name,
15+
reformat_transcripts_for_export,
1916
unmap_formatting_annotation_enums,
2017
)
2118
from v03_pipeline.lib.tasks.files import GCSorLocalFolderTarget, GCSorLocalTarget
2219
from v03_pipeline.lib.tasks.update_new_variants_with_caids import (
2320
UpdateNewVariantsWithCAIDsTask,
2421
)
25-
from v03_pipeline.lib.tasks.update_variant_annotations_table_with_new_samples import (
26-
UpdateVariantAnnotationsTableWithNewSamplesTask,
27-
)
28-
from v03_pipeline.lib.tasks.write_new_variants_table import WriteNewVariantsTableTask
2922

3023

3124
@luigi.util.inherits(BaseLoadingRunParams)
@@ -43,35 +36,16 @@ def complete(self) -> luigi.Target:
4336
return GCSorLocalFolderTarget(self.output().path).exists()
4437

4538
def requires(self) -> luigi.Task:
46-
if self.dataset_type.export_all_callset_variants:
47-
return self.clone(UpdateVariantAnnotationsTableWithNewSamplesTask)
48-
if self.dataset_type.should_send_to_allele_registry:
49-
return self.clone(UpdateNewVariantsWithCAIDsTask)
50-
return self.clone(WriteNewVariantsTableTask)
39+
return self.clone(UpdateNewVariantsWithCAIDsTask)
5140

5241
def create_table(self) -> None:
53-
if self.dataset_type.export_all_callset_variants:
54-
ht = hl.read_table(
55-
variant_annotations_table_path(
56-
self.reference_genome,
57-
self.dataset_type,
58-
),
59-
)
60-
callset_ht = get_callset_ht(
42+
ht = hl.read_table(
43+
new_variants_table_path(
6144
self.reference_genome,
6245
self.dataset_type,
63-
self.callset_path,
64-
self.project_guids,
65-
)
66-
ht = ht.semi_join(callset_ht)
67-
else:
68-
ht = hl.read_table(
69-
new_variants_table_path(
70-
self.reference_genome,
71-
self.dataset_type,
72-
self.run_id,
73-
),
74-
)
46+
self.run_id,
47+
),
48+
)
7549
ht = unmap_formatting_annotation_enums(
7650
ht,
7751
self.reference_genome,
@@ -86,17 +60,6 @@ def create_table(self) -> None:
8660
return ht.select(
8761
key_=ht.key_,
8862
transcripts=hl.enumerate(
89-
ht[transcripts_field_name(self.reference_genome, self.dataset_type)],
90-
)
91-
.starmap(
92-
lambda i, s: (
93-
s
94-
if hasattr(s, 'majorConsequence')
95-
else s.annotate(
96-
majorConsequence=s.consequenceTerms.first(),
97-
transcriptRank=i,
98-
)
99-
),
100-
)
101-
.map(sorted_hl_struct),
63+
ht.sortedTranscriptConsequences,
64+
).starmap(reformat_transcripts_for_export),
10265
)

0 commit comments

Comments (0)