Skip to content

Commit 19ecf18

Browse files
authored
feat: extract-vars writes thinned-out SCV infos (#240) (#258)
1 parent 66671cd commit 19ecf18

10 files changed

Lines changed: 66 additions & 13 deletions

File tree

clinvar_data/extract_vars.py

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,10 @@
1010
import tqdm
1111

1212
from clinvar_data.pbs.clinvar_public import Allele, ClassifiedRecord, VariationArchive
13-
from clinvar_data.pbs.clinvar_public_pb2 import AggregateClassificationSet
13+
from clinvar_data.pbs.clinvar_public_pb2 import (
14+
AggregateClassificationSet,
15+
ClinicalAssertion,
16+
)
1417
from clinvar_data.pbs.extracted_vars import (
1518
ExtractedRcvRecord,
1619
ExtractedVcvRecord,
@@ -93,6 +96,43 @@ def thin_out_aggregate_classification_set(
9396
return result
9497

9598

99+
def thin_out_clinical_assertions(
    clinical_assertions: typing.Iterable[ClinicalAssertion],
) -> list[ClinicalAssertion]:
    """Build reduced copies of the given clinical assertions.

    Every input message is copied into a fresh ``ClinicalAssertion`` and all
    clearing is applied to that copy.  On the copy, the top-level fields named
    in ``dropped_fields`` are cleared.  In addition, ``submitter_identifiers``
    is cleared on ``clinvar_accession`` and ``comments`` is cleared on
    ``classifications``, each only when the respective sub-message is set.

    :param clinical_assertions: The clinical assertions to reduce.
    :return: List with one reduced copy per input message, in input order.
    """
    # Top-level fields removed from every copy.
    dropped_fields = (
        "clinvar_submission_id",
        "additional_submitters",
        "record_status",
        "attributes",
        "observed_ins",
        "simple_allele",
        "haplotype",
        "genotype",
        "trait_set",
        "citations",
        "study_name",
        "study_description",
        "comments",
        "submission_names",
        "date_created",
        "date_last_updated",
        "submission_date",
        "id",
        "fda_recognized_database",
    )

    thinned = []
    for original in clinical_assertions:
        slim = ClinicalAssertion()
        slim.CopyFrom(original)

        for field_name in dropped_fields:
            slim.ClearField(field_name)

        # Nested fields are only touched when the parent sub-message is set.
        if slim.HasField("clinvar_accession"):
            slim.clinvar_accession.ClearField("submitter_identifiers")
        if slim.HasField("classifications"):
            slim.classifications.ClearField("comments")

        thinned.append(slim)
    return thinned
134+
135+
96136
def run(path_input: str, output_dir: str, gzip_output: bool):
97137
"""Execute the variant extraction."""
98138
os.makedirs(output_dir, exist_ok=True)
@@ -142,7 +182,6 @@ def run(path_input: str, output_dir: str, gzip_output: bool):
142182
for gene in classified_record.simple_allele.genes
143183
if gene.HasField("hgnc_id")
144184
]
145-
146185
for location in simple_allele.locations or []:
147186
for sequence_location in location.sequence_locations or []:
148187
record = ExtractedVcvRecord(
@@ -153,6 +192,9 @@ def run(path_input: str, output_dir: str, gzip_output: bool):
153192
classifications=(
154193
thin_out_aggregate_classification_set(classified_record.classifications)
155194
),
195+
clinical_assertions=(
196+
thin_out_clinical_assertions(classified_record.clinical_assertions)
197+
),
156198
sequence_location=sequence_location,
157199
hgnc_ids=hgnc_ids,
158200
)

clinvar_data/pbs/class_by_freq_pb2.py

Lines changed: 0 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

clinvar_data/pbs/clinvar_public_pb2.py

Lines changed: 0 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

clinvar_data/pbs/extracted_vars_pb2.py

Lines changed: 4 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

clinvar_data/pbs/extracted_vars_pb2.pyi

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,7 @@ class ExtractedVcvRecord(google.protobuf.message.Message):
171171
NAME_FIELD_NUMBER: builtins.int
172172
VARIATION_TYPE_FIELD_NUMBER: builtins.int
173173
CLASSIFICATIONS_FIELD_NUMBER: builtins.int
174+
CLINICAL_ASSERTIONS_FIELD_NUMBER: builtins.int
174175
SEQUENCE_LOCATION_FIELD_NUMBER: builtins.int
175176
HGNC_IDS_FIELD_NUMBER: builtins.int
176177
name: builtins.str
@@ -193,6 +194,14 @@ class ExtractedVcvRecord(google.protobuf.message.Message):
193194
def classifications(self) -> clinvar_data.pbs.clinvar_public_pb2.AggregateClassificationSet:
194195
"""Classifications (thinned out)."""
195196

197+
@property
198+
def clinical_assertions(
199+
self,
200+
) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[
201+
clinvar_data.pbs.clinvar_public_pb2.ClinicalAssertion
202+
]:
203+
"""Clinical assertions (thinned out),"""
204+
196205
@property
197206
def sequence_location(self) -> clinvar_data.pbs.clinvar_public_pb2.Location.SequenceLocation:
198207
"""The sequence location on one reference."""
@@ -213,6 +222,9 @@ class ExtractedVcvRecord(google.protobuf.message.Message):
213222
classifications: (
214223
clinvar_data.pbs.clinvar_public_pb2.AggregateClassificationSet | None
215224
) = ...,
225+
clinical_assertions: (
226+
collections.abc.Iterable[clinvar_data.pbs.clinvar_public_pb2.ClinicalAssertion] | None
227+
) = ...,
216228
sequence_location: (
217229
clinvar_data.pbs.clinvar_public_pb2.Location.SequenceLocation | None
218230
) = ...,
@@ -236,6 +248,8 @@ class ExtractedVcvRecord(google.protobuf.message.Message):
236248
b"accession",
237249
"classifications",
238250
b"classifications",
251+
"clinical_assertions",
252+
b"clinical_assertions",
239253
"hgnc_ids",
240254
b"hgnc_ids",
241255
"name",

clinvar_data/pbs/gene_impact_pb2.py

Lines changed: 0 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

clinvar_data/pbs/phenotype_link_pb2.py

Lines changed: 0 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

protos/clinvar_data/pbs/extracted_vars.proto

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@ message ExtractedVcvRecord {
6868
VariationType variation_type = 4;
6969
// Classifications (thinned out).
7070
clinvar_data.pbs.clinvar_public.AggregateClassificationSet classifications = 5;
71+
// Clinical assertions (thinned out).
72+
repeated clinvar_data.pbs.clinvar_public.ClinicalAssertion clinical_assertions = 8;
7173
// The sequence location on one reference.
7274
clinvar_data.pbs.clinvar_public.Location.SequenceLocation sequence_location = 6;
7375
// List of HGNC IDs.
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"accession": {"accession": "VCV000978270", "version": 1}, "rcvs": [{"accession": {"accession": "RCV001256675", "version": 1}, "title": "NM_003937.3(KYNU):c.326G>C (p.Trp109Ser) AND Catel-Manzke syndrome", "classifications": {"germlineClassification": {"reviewStatus": "AGGREGATE_GERMLINE_REVIEW_STATUS_CRITERIA_PROVIDED_SINGLE_SUBMITTER", "description": {"value": "Pathogenic", "dateLastEvaluated": "2012-01-07T00:00:00Z", "submissionCount": 1}}}}], "name": "NM_003937.3(KYNU):c.326G>C (p.Trp109Ser)", "variationType": "VARIATION_TYPE_SNV", "classifications": {"germlineClassification": {"reviewStatus": "AGGREGATE_GERMLINE_REVIEW_STATUS_CRITERIA_PROVIDED_SINGLE_SUBMITTER", "description": "Pathogenic", "dateLastEvaluated": "2012-01-07T00:00:00Z", "dateCreated": "2020-09-27T00:00:00Z", "mostRecentSubmission": "2020-09-27T00:00:00Z", "numberOfSubmitters": 1, "numberOfSubmissions": 1}}, "sequenceLocation": {"assembly": "GRCh37", "chr": "CHROMOSOME_2", "accession": "NC_000002.11", "start": 143685263, "stop": 143685263, "displayStart": 143685263, "displayStop": 143685263, "variantLength": 1, "referenceAllele": "G", "alternateAllele": "C", "positionVcf": 143685263, "referenceAlleleVcf": "G", "alternateAlleleVcf": "C"}, "hgncIds": ["HGNC:6469"]}
1+
{"accession": {"accession": "VCV000978270", "version": 1}, "rcvs": [{"accession": {"accession": "RCV001256675", "version": 1}, "title": "NM_003937.3(KYNU):c.326G>C (p.Trp109Ser) AND Catel-Manzke syndrome", "classifications": {"germlineClassification": {"reviewStatus": "AGGREGATE_GERMLINE_REVIEW_STATUS_CRITERIA_PROVIDED_SINGLE_SUBMITTER", "description": {"value": "Pathogenic", "dateLastEvaluated": "2012-01-07T00:00:00Z", "submissionCount": 1}}}}], "name": "NM_003937.3(KYNU):c.326G>C (p.Trp109Ser)", "variationType": "VARIATION_TYPE_SNV", "classifications": {"germlineClassification": {"reviewStatus": "AGGREGATE_GERMLINE_REVIEW_STATUS_CRITERIA_PROVIDED_SINGLE_SUBMITTER", "description": "Pathogenic", "dateLastEvaluated": "2012-01-07T00:00:00Z", "dateCreated": "2020-09-27T00:00:00Z", "mostRecentSubmission": "2020-09-27T00:00:00Z", "numberOfSubmitters": 1, "numberOfSubmissions": 1}}, "sequenceLocation": {"assembly": "GRCh37", "chr": "CHROMOSOME_2", "accession": "NC_000002.11", "start": 143685263, "stop": 143685263, "displayStart": 143685263, "displayStop": 143685263, "variantLength": 1, "referenceAllele": "G", "alternateAllele": "C", "positionVcf": 143685263, "referenceAlleleVcf": "G", "alternateAlleleVcf": "C"}, "hgncIds": ["HGNC:6469"], "clinicalAssertions": [{"clinvarAccession": {"accession": "SCV001433049", "version": 1, "dateUpdated": "2020-09-27T00:00:00Z", "dateCreated": "2020-09-27T00:00:00Z"}, "classifications": {"reviewStatus": "SUBMITTER_REVIEW_STATUS_CRITERIA_PROVIDED_SINGLE_SUBMITTER", "germlineClassification": "Pathogenic", "dateLastEvaluated": "2012-01-07T00:00:00Z"}, "assertion": "ASSERTION_VARIATION_TO_DISEASE"}]}
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"accession": {"accession": "VCV000978270", "version": 1}, "rcvs": [{"accession": {"accession": "RCV001256675", "version": 1}, "title": "NM_003937.3(KYNU):c.326G>C (p.Trp109Ser) AND Catel-Manzke syndrome", "classifications": {"germlineClassification": {"reviewStatus": "AGGREGATE_GERMLINE_REVIEW_STATUS_CRITERIA_PROVIDED_SINGLE_SUBMITTER", "description": {"value": "Pathogenic", "dateLastEvaluated": "2012-01-07T00:00:00Z", "submissionCount": 1}}}}], "name": "NM_003937.3(KYNU):c.326G>C (p.Trp109Ser)", "variationType": "VARIATION_TYPE_SNV", "classifications": {"germlineClassification": {"reviewStatus": "AGGREGATE_GERMLINE_REVIEW_STATUS_CRITERIA_PROVIDED_SINGLE_SUBMITTER", "description": "Pathogenic", "dateLastEvaluated": "2012-01-07T00:00:00Z", "dateCreated": "2020-09-27T00:00:00Z", "mostRecentSubmission": "2020-09-27T00:00:00Z", "numberOfSubmitters": 1, "numberOfSubmissions": 1}}, "sequenceLocation": {"forDisplay": true, "assembly": "GRCh38", "chr": "CHROMOSOME_2", "accession": "NC_000002.12", "start": 142927694, "stop": 142927694, "displayStart": 142927694, "displayStop": 142927694, "variantLength": 1, "positionVcf": 142927694, "referenceAlleleVcf": "G", "alternateAlleleVcf": "C"}, "hgncIds": ["HGNC:6469"]}
1+
{"accession": {"accession": "VCV000978270", "version": 1}, "rcvs": [{"accession": {"accession": "RCV001256675", "version": 1}, "title": "NM_003937.3(KYNU):c.326G>C (p.Trp109Ser) AND Catel-Manzke syndrome", "classifications": {"germlineClassification": {"reviewStatus": "AGGREGATE_GERMLINE_REVIEW_STATUS_CRITERIA_PROVIDED_SINGLE_SUBMITTER", "description": {"value": "Pathogenic", "dateLastEvaluated": "2012-01-07T00:00:00Z", "submissionCount": 1}}}}], "name": "NM_003937.3(KYNU):c.326G>C (p.Trp109Ser)", "variationType": "VARIATION_TYPE_SNV", "classifications": {"germlineClassification": {"reviewStatus": "AGGREGATE_GERMLINE_REVIEW_STATUS_CRITERIA_PROVIDED_SINGLE_SUBMITTER", "description": "Pathogenic", "dateLastEvaluated": "2012-01-07T00:00:00Z", "dateCreated": "2020-09-27T00:00:00Z", "mostRecentSubmission": "2020-09-27T00:00:00Z", "numberOfSubmitters": 1, "numberOfSubmissions": 1}}, "sequenceLocation": {"forDisplay": true, "assembly": "GRCh38", "chr": "CHROMOSOME_2", "accession": "NC_000002.12", "start": 142927694, "stop": 142927694, "displayStart": 142927694, "displayStop": 142927694, "variantLength": 1, "positionVcf": 142927694, "referenceAlleleVcf": "G", "alternateAlleleVcf": "C"}, "hgncIds": ["HGNC:6469"], "clinicalAssertions": [{"clinvarAccession": {"accession": "SCV001433049", "version": 1, "dateUpdated": "2020-09-27T00:00:00Z", "dateCreated": "2020-09-27T00:00:00Z"}, "classifications": {"reviewStatus": "SUBMITTER_REVIEW_STATUS_CRITERIA_PROVIDED_SINGLE_SUBMITTER", "germlineClassification": "Pathogenic", "dateLastEvaluated": "2012-01-07T00:00:00Z"}, "assertion": "ASSERTION_VARIATION_TO_DISEASE"}]}

0 commit comments

Comments
 (0)