Skip to content

Commit 599faa7

Browse files
authored
feat: implement GCNV (#1112)
* transcripts * ruff * ruff * delete dead code * finish off test * remove enabled property * export more variants * ruff * get mocking correct
1 parent e1628a3 commit 599faa7

27 files changed

+443
-61
lines changed

v03_pipeline/lib/model/dataset_type.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -393,8 +393,8 @@ def should_export_to_vcf(self):
393393
return self == DatasetType.SV
394394

395395
@property
396-
def should_export_to_parquet(self):
397-
return self == DatasetType.SNV_INDEL
396+
def export_all_callset_variants(self):
397+
return self == DatasetType.GCNV
398398

399399
@property
400400
def export_vcf_annotation_fns(self) -> list[Callable[..., hl.Expression]]:

v03_pipeline/lib/tasks/exports/fields.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@ def get_dataset_type_specific_annotations(
6060
'svType': ht.sv_type,
6161
'svTypeDetail': ht.sv_type_detail,
6262
},
63+
DatasetType.GCNV: lambda ht: {
64+
'numExon': ht.num_exon,
65+
'svType': ht.sv_type,
66+
},
6367
}[dataset_type](ht)
6468

6569

@@ -92,6 +96,20 @@ def get_calls_export_fields(
9296
prevCall=fe.concordance.prev_call,
9397
prevNumAlt=fe.concordance.prev_num_alt,
9498
),
99+
DatasetType.GCNV: lambda fe: hl.Struct(
100+
sampleId=fe.s,
101+
gt=fe.GT.n_alt_alleles(),
102+
cn=fe.CN,
103+
qs=fe.QS,
104+
defragged=fe.defragged,
105+
start=fe.sample_start,
106+
end=fe.sample_end,
107+
numExon=fe.sample_num_exon,
108+
geneIds=fe.sample_gene_ids,
109+
newCall=fe.concordance.new_call,
110+
prevCall=fe.concordance.prev_call,
111+
prevOverlap=fe.concordance.prev_overlap,
112+
),
95113
}[dataset_type](fe)
96114

97115

@@ -162,6 +180,9 @@ def get_predictions_export_fields(
162180
DatasetType.SV: lambda ht: {
163181
'strvctvre': ht.strvctvre.score,
164182
},
183+
DatasetType.GCNV: lambda ht: {
184+
'strvctvre': ht.strvctvre.score,
185+
},
165186
}[dataset_type](ht)
166187

167188

@@ -233,6 +254,15 @@ def get_populations_export_fields(ht: hl.Table, dataset_type: DatasetType):
233254
id=ht.gnomad_svs.ID,
234255
),
235256
},
257+
DatasetType.GCNV: lambda ht: {
258+
'seqrPop': hl.Struct(
259+
af=ht.gt_stats.AF,
260+
ac=ht.gt_stats.AC,
261+
an=ht.gt_stats.AN,
262+
Hom=ht.gt_stats.Hom,
263+
Het=ht.gt_stats.Het,
264+
),
265+
},
236266
}[dataset_type](ht)
237267

238268

@@ -241,7 +271,7 @@ def get_position_fields(ht: hl.Table, dataset_type: DatasetType):
241271
return {
242272
'chrom': reference_independent_contig(ht.start_locus),
243273
'pos': ht.start_locus.position,
244-
'end_locus': ht.end_locus.position,
274+
'end': ht.end_locus.position,
245275
'rg37LocusEnd': hl.Struct(
246276
contig=reference_independent_contig(ht.rg37_locus_end),
247277
position=ht.rg37_locus_end.position,
@@ -272,6 +302,9 @@ def get_variant_id_fields(
272302
DatasetType.SV: lambda ht: {
273303
'variantId': ht.variant_id,
274304
},
305+
DatasetType.GCNV: lambda ht: {
306+
'variantId': ht.variant_id,
307+
},
275308
}[dataset_type](ht)
276309

277310

v03_pipeline/lib/tasks/exports/selects.py

Lines changed: 0 additions & 31 deletions
This file was deleted.

v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,16 @@
3535
TEST_SNV_INDEL_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf'
3636
TEST_MITO_CALLSET = 'v03_pipeline/var/test/callsets/mito_1.mt'
3737
TEST_SV_VCF_2 = 'v03_pipeline/var/test/callsets/sv_2.vcf'
38+
TEST_GCNV_BED_FILE = 'v03_pipeline/var/test/callsets/gcnv_1.tsv'
3839

3940

4041
TEST_RUN_ID = 'manual__2024-04-03'
4142

4243

4344
class WriteNewEntriesParquetTest(MockedReferenceDatasetsTestCase):
4445
def setUp(self) -> None:
46+
# NOTE: The annotations tables are mocked for SNV_INDEL & MITO
47+
# to avoid reference dataset updates that SV/GCNV do not have.
4548
super().setUp()
4649
mt = import_callset(
4750
TEST_SNV_INDEL_VCF,
@@ -417,3 +420,167 @@ def test_sv_write_new_entries_parquet(self):
417420
},
418421
],
419422
)
423+
424+
def test_gcnv_write_new_entries_parquet(self):
425+
worker = luigi.worker.Worker()
426+
task = WriteNewEntriesParquetTask(
427+
reference_genome=ReferenceGenome.GRCh38,
428+
dataset_type=DatasetType.GCNV,
429+
sample_type=SampleType.WES,
430+
callset_path=TEST_GCNV_BED_FILE,
431+
project_guids=['R0115_test_project2'],
432+
project_pedigree_paths=[TEST_PEDIGREE_5],
433+
skip_validation=True,
434+
run_id=TEST_RUN_ID,
435+
)
436+
worker.add(task)
437+
worker.run()
438+
self.assertTrue(task.output().exists())
439+
self.assertTrue(task.complete())
440+
df = pd.read_parquet(
441+
new_entries_parquet_path(
442+
ReferenceGenome.GRCh38,
443+
DatasetType.GCNV,
444+
TEST_RUN_ID,
445+
),
446+
)
447+
export_json = convert_ndarray_to_list(df.to_dict('records'))
448+
self.assertEqual(
449+
export_json,
450+
[
451+
{
452+
'key': 0,
453+
'project_guid': 'R0115_test_project2',
454+
'family_guid': 'family_2_1',
455+
'sample_type': 'WES',
456+
'xpos': 1100006937,
457+
'filters': [],
458+
'calls': [
459+
{
460+
'sampleId': 'RGP_164_1',
461+
'gt': 1,
462+
'cn': 1,
463+
'qs': 4,
464+
'defragged': False,
465+
'start': 100006937,
466+
'end': 100007881,
467+
'numExon': 2.0,
468+
'geneIds': ['ENSG00000117620', 'ENSG00000283761'],
469+
'newCall': False,
470+
'prevCall': True,
471+
'prevOverlap': False,
472+
},
473+
{
474+
'sampleId': 'RGP_164_2',
475+
'gt': 1,
476+
'cn': 1,
477+
'qs': 5,
478+
'defragged': False,
479+
'start': 100017585,
480+
'end': 100023213,
481+
'numExon': None,
482+
'geneIds': ['ENSG00000117620', 'ENSG00000283761'],
483+
'newCall': False,
484+
'prevCall': False,
485+
'prevOverlap': False,
486+
},
487+
{
488+
'sampleId': 'RGP_164_3',
489+
'gt': 2,
490+
'cn': 0,
491+
'qs': 30,
492+
'defragged': False,
493+
'start': 100017585,
494+
'end': 100023213,
495+
'numExon': None,
496+
'geneIds': ['ENSG00000117620', 'ENSG00000283761'],
497+
'newCall': False,
498+
'prevCall': True,
499+
'prevOverlap': False,
500+
},
501+
{
502+
'sampleId': 'RGP_164_4',
503+
'gt': 2,
504+
'cn': 0.0,
505+
'qs': 30.0,
506+
'defragged': False,
507+
'start': 100017586.0,
508+
'end': 100023212.0,
509+
'numExon': 2.0,
510+
'geneIds': ['ENSG00000283761', 'ENSG22222222222'],
511+
'newCall': False,
512+
'prevCall': True,
513+
'prevOverlap': False,
514+
},
515+
],
516+
'sign': 1,
517+
},
518+
{
519+
'key': 1,
520+
'project_guid': 'R0115_test_project2',
521+
'family_guid': 'family_2_1',
522+
'sample_type': 'WES',
523+
'xpos': 1100017586,
524+
'filters': [],
525+
'calls': [
526+
{
527+
'sampleId': 'RGP_164_1',
528+
'gt': None,
529+
'cn': None,
530+
'qs': None,
531+
'defragged': None,
532+
'start': None,
533+
'end': None,
534+
'numExon': None,
535+
'geneIds': None,
536+
'newCall': None,
537+
'prevCall': None,
538+
'prevOverlap': None,
539+
},
540+
{
541+
'sampleId': 'RGP_164_2',
542+
'gt': None,
543+
'cn': None,
544+
'qs': None,
545+
'defragged': None,
546+
'start': None,
547+
'end': None,
548+
'numExon': None,
549+
'geneIds': None,
550+
'newCall': None,
551+
'prevCall': None,
552+
'prevOverlap': None,
553+
},
554+
{
555+
'sampleId': 'RGP_164_3',
556+
'gt': 2,
557+
'cn': 0.0,
558+
'qs': 30.0,
559+
'defragged': False,
560+
'start': None,
561+
'end': None,
562+
'numExon': None,
563+
'geneIds': None,
564+
'newCall': False,
565+
'prevCall': True,
566+
'prevOverlap': False,
567+
},
568+
{
569+
'sampleId': 'RGP_164_4',
570+
'gt': None,
571+
'cn': None,
572+
'qs': None,
573+
'defragged': None,
574+
'start': None,
575+
'end': None,
576+
'numExon': None,
577+
'geneIds': None,
578+
'newCall': None,
579+
'prevCall': None,
580+
'prevOverlap': None,
581+
},
582+
],
583+
'sign': 1,
584+
},
585+
],
586+
)

v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py

Lines changed: 31 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,11 @@
22
import luigi
33
import luigi.util
44

5+
from v03_pipeline.lib.misc.callsets import get_callset_ht
56
from v03_pipeline.lib.paths import (
67
new_transcripts_parquet_path,
78
new_variants_table_path,
9+
variant_annotations_table_path,
810
)
911
from v03_pipeline.lib.tasks.base.base_loading_run_params import (
1012
BaseLoadingRunParams,
@@ -20,6 +22,9 @@
2022
from v03_pipeline.lib.tasks.update_new_variants_with_caids import (
2123
UpdateNewVariantsWithCAIDsTask,
2224
)
25+
from v03_pipeline.lib.tasks.update_variant_annotations_table_with_new_samples import (
26+
UpdateVariantAnnotationsTableWithNewSamplesTask,
27+
)
2328
from v03_pipeline.lib.tasks.write_new_variants_table import WriteNewVariantsTableTask
2429

2530

@@ -37,21 +42,36 @@ def output(self) -> luigi.Target:
3742
def complete(self) -> luigi.Target:
3843
return GCSorLocalFolderTarget(self.output().path).exists()
3944

40-
def requires(self) -> list[luigi.Task]:
41-
return [
42-
self.clone(UpdateNewVariantsWithCAIDsTask)
43-
if self.dataset_type.should_send_to_allele_registry
44-
else self.clone(WriteNewVariantsTableTask),
45-
]
45+
def requires(self) -> luigi.Task:
46+
if self.dataset_type.export_all_callset_variants:
47+
return self.clone(UpdateVariantAnnotationsTableWithNewSamplesTask)
48+
if self.dataset_type.should_send_to_allele_registry:
49+
return self.clone(UpdateNewVariantsWithCAIDsTask)
50+
return self.clone(WriteNewVariantsTableTask)
4651

4752
def create_table(self) -> None:
48-
ht = hl.read_table(
49-
new_variants_table_path(
53+
if self.dataset_type.export_all_callset_variants:
54+
ht = hl.read_table(
55+
variant_annotations_table_path(
56+
self.reference_genome,
57+
self.dataset_type,
58+
),
59+
)
60+
callset_ht = get_callset_ht(
5061
self.reference_genome,
5162
self.dataset_type,
52-
self.run_id,
53-
),
54-
)
63+
self.callset_path,
64+
self.project_guids,
65+
)
66+
ht = ht.semi_join(callset_ht)
67+
else:
68+
ht = hl.read_table(
69+
new_variants_table_path(
70+
self.reference_genome,
71+
self.dataset_type,
72+
self.run_id,
73+
),
74+
)
5575
ht = unmap_formatting_annotation_enums(
5676
ht,
5777
self.reference_genome,

0 commit comments

Comments
 (0)