Skip to content

Commit dc9652c

Browse files
committed
fix: refined (thinned out) extraction of variants (#219)
This removes the "xrefs", "citations", "history_records", and "conditions" fields from the aggregate classifications of the VCV records written out by the "data extract-vars" command. It also adds the RCV-level classifications to each extracted RCV record.
1 parent f6894ca commit dc9652c

11 files changed

Lines changed: 65 additions & 23 deletions

File tree

clinvar_data/extract_vars.py

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@
99
from google.protobuf.json_format import MessageToJson, ParseDict
1010
import tqdm
1111

12-
from clinvar_data.pbs.clinvar_public import Allele, VariationArchive
13-
from clinvar_data.pbs.clinvar_public_pb2 import ClassifiedRecord
12+
from clinvar_data.pbs.clinvar_public import Allele, ClassifiedRecord, VariationArchive
13+
from clinvar_data.pbs.clinvar_public_pb2 import AggregateClassificationSet
1414
from clinvar_data.pbs.extracted_vars import (
1515
ExtractedRcvRecord,
1616
ExtractedVcvRecord,
@@ -72,6 +72,27 @@ def from_string_value(cls, string_value: str) -> VariationType.ValueType:
7272
return cls.CONVERT.get(string_value.lower(), VariationType.VARIATION_TYPE_OTHER)
7373

7474

75+
def thin_out_aggregate_classification_set(
    classifications: AggregateClassificationSet | None,
) -> AggregateClassificationSet | None:
    """Return a reduced copy of ``classifications`` for extracted variants.

    ``None`` is handed back unchanged.  Otherwise the input is copied into a
    fresh ``AggregateClassificationSet`` and only that copy is modified: the
    ``xrefs``, ``citations``, ``history_records`` and ``conditions`` fields
    are cleared on the germline classification (when set), on every entry of
    ``somatic_clinical_impacts``, and on the oncogenicity classification
    (when set).
    """
    if classifications is None:
        return None

    dropped_fields = ("xrefs", "citations", "history_records", "conditions")

    def strip(message) -> None:
        """Clear every field named in ``dropped_fields`` on ``message``."""
        for field_name in dropped_fields:
            message.ClearField(field_name)

    # Work on a copy so the caller's message keeps its full content.
    thinned = AggregateClassificationSet()
    thinned.CopyFrom(classifications)

    if thinned.HasField("germline_classification"):
        strip(thinned.germline_classification)
    for impact in thinned.somatic_clinical_impacts:
        strip(impact)
    if thinned.HasField("oncogenicity_classification"):
        strip(thinned.oncogenicity_classification)

    return thinned
94+
95+
7596
def run(path_input: str, output_dir: str, gzip_output: bool):
7697
"""Execute the variant extraction."""
7798
os.makedirs(output_dir, exist_ok=True)
@@ -107,11 +128,12 @@ def run(path_input: str, output_dir: str, gzip_output: bool):
107128
)
108129
rcvs: list[ExtractedRcvRecord] = [
109130
ExtractedRcvRecord(
110-
title=rcva.title,
111131
accession=VersionedAccession(
112132
accession=rcva.accession,
113133
version=rcva.version,
114134
),
135+
title=rcva.title,
136+
classifications=rcva.rcv_classifications,
115137
)
116138
for rcva in classified_record.rcv_list.rcv_accessions
117139
]
@@ -128,7 +150,9 @@ def run(path_input: str, output_dir: str, gzip_output: bool):
128150
rcvs=rcvs,
129151
name=name,
130152
variation_type=variation_type,
131-
classifications=classified_record.classifications,
153+
classifications=(
154+
thin_out_aggregate_classification_set(classified_record.classifications)
155+
),
132156
sequence_location=sequence_location,
133157
hgnc_ids=hgnc_ids,
134158
)

clinvar_data/pbs/class_by_freq_pb2.py

Lines changed: 0 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

clinvar_data/pbs/clinvar_public_pb2.py

Lines changed: 0 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

clinvar_data/pbs/clinvar_public_pb2.pyi

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2537,7 +2537,10 @@ class Indication(google.protobuf.message.Message):
25372537
COMMENTS_FIELD_NUMBER: builtins.int
25382538
TYPE_FIELD_NUMBER: builtins.int
25392539
type: global___Indication.Type.ValueType
2540-
"""attributes"""
2540+
"""attributes
2541+
2542+
The type of indication.
2543+
"""
25412544
@property
25422545
def traits(
25432546
self,

clinvar_data/pbs/extracted_vars_pb2.py

Lines changed: 7 additions & 8 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

clinvar_data/pbs/extracted_vars_pb2.pyi

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -123,21 +123,39 @@ class ExtractedRcvRecord(google.protobuf.message.Message):
123123

124124
ACCESSION_FIELD_NUMBER: builtins.int
125125
TITLE_FIELD_NUMBER: builtins.int
126+
CLASSIFICATIONS_FIELD_NUMBER: builtins.int
126127
title: builtins.str
127128
"""Title of RCV."""
128129
@property
129130
def accession(self) -> global___VersionedAccession:
130131
"""The accession."""
131132

133+
@property
134+
def classifications(
135+
self,
136+
) -> clinvar_data.pbs.clinvar_public_pb2.RcvAccession.RcvClassifications:
137+
"""Classifications (thinned out)."""
138+
132139
def __init__(
133140
self,
134141
*,
135142
accession: global___VersionedAccession | None = ...,
136143
title: builtins.str = ...,
144+
classifications: (
145+
clinvar_data.pbs.clinvar_public_pb2.RcvAccession.RcvClassifications | None
146+
) = ...,
137147
) -> None: ...
138-
def HasField(self, field_name: typing.Literal["accession", b"accession"]) -> builtins.bool: ...
148+
def HasField(
149+
self,
150+
field_name: typing.Literal[
151+
"accession", b"accession", "classifications", b"classifications"
152+
],
153+
) -> builtins.bool: ...
139154
def ClearField(
140-
self, field_name: typing.Literal["accession", b"accession", "title", b"title"]
155+
self,
156+
field_name: typing.Literal[
157+
"accession", b"accession", "classifications", b"classifications", "title", b"title"
158+
],
141159
) -> None: ...
142160

143161
global___ExtractedRcvRecord = ExtractedRcvRecord
@@ -173,7 +191,7 @@ class ExtractedVcvRecord(google.protobuf.message.Message):
173191

174192
@property
175193
def classifications(self) -> clinvar_data.pbs.clinvar_public_pb2.AggregateClassificationSet:
176-
"""Classifications."""
194+
"""Classifications (thinned out)."""
177195

178196
@property
179197
def sequence_location(self) -> clinvar_data.pbs.clinvar_public_pb2.Location.SequenceLocation:

clinvar_data/pbs/gene_impact_pb2.py

Lines changed: 0 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

clinvar_data/pbs/phenotype_link_pb2.py

Lines changed: 0 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

protos/clinvar_data/pbs/extracted_vars.proto

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ message ExtractedRcvRecord {
5252
VersionedAccession accession = 1;
5353
// Title of RCV.
5454
string title = 2;
55+
// Classifications of the RCV (copied as-is from the RCV accession's classifications).
56+
clinvar_data.pbs.clinvar_public.RcvAccession.RcvClassifications classifications = 3;
5557
}
5658

5759
// Protocol buffer for storing essential information of one VCV.
@@ -64,7 +66,7 @@ message ExtractedVcvRecord {
6466
string name = 3;
6567
// The type of the variant.
6668
VariationType variation_type = 4;
67-
// Classifications.
69+
// Classifications (thinned out).
6870
clinvar_data.pbs.clinvar_public.AggregateClassificationSet classifications = 5;
6971
// The sequence location on one reference.
7072
clinvar_data.pbs.clinvar_public.Location.SequenceLocation sequence_location = 6;
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"accession": {"accession": "VCV000978270", "version": 1}, "rcvs": [{"accession": {"accession": "RCV001256675", "version": 1}, "title": "NM_003937.3(KYNU):c.326G>C (p.Trp109Ser) AND Catel-Manzke syndrome"}], "name": "NM_003937.3(KYNU):c.326G>C (p.Trp109Ser)", "variationType": "VARIATION_TYPE_SNV", "classifications": {"germlineClassification": {"reviewStatus": "AGGREGATE_GERMLINE_REVIEW_STATUS_CRITERIA_PROVIDED_SINGLE_SUBMITTER", "description": "Pathogenic", "conditions": [{"traits": [{"names": [{"value": "Hyperphalangy-clinodactyly of index finger with Pierre Robin syndrome", "type": "Alternate"}, {"value": "Pierre Robin syndrome with hyperphalangy and clinodactyly", "type": "Alternate"}, {"value": "Palatodigital syndrome Catel-Manzke type", "type": "Alternate"}, {"value": "Index finger anomaly with Pierre Robin syndrome", "type": "Alternate"}, {"value": "Catel-Manzke syndrome", "type": "Preferred", "xrefs": [{"db": "MONDO", "id": "MONDO:0014507"}]}, {"value": "MICROGNATHIA DIGITAL SYNDROME", "type": "Alternate", "xrefs": [{"db": "OMIM", "id": "616145", "type": "MIM"}]}], "symbols": [{"value": "CATMANS", "type": "Alternate", "xrefs": [{"db": "OMIM", "id": "616145", "type": "MIM"}]}], "attributes": [{"attribute": {"base": {"integerValue": "28"}, "type": "GARD id"}, "xrefs": [{"db": "Office of Rare Diseases", "id": "28"}]}], "xrefs": [{"db": "Orphanet", "id": "1388"}, {"db": "MedGen", "id": "C1844887"}, {"db": "MONDO", "id": "MONDO:0014507"}, {"db": "OMIM", "id": "616145", "type": "MIM"}]}], "type": "TYPE_DISEASE", "id": "20503", "contributesToAggregateClassification": true}], "dateLastEvaluated": "2012-01-07T00:00:00Z", "dateCreated": "2020-09-27T00:00:00Z", "mostRecentSubmission": "2020-09-27T00:00:00Z", "numberOfSubmitters": 1, "numberOfSubmissions": 1}}, "sequenceLocation": {"assembly": "GRCh37", "chr": "CHROMOSOME_2", "accession": "NC_000002.11", "start": 143685263, "stop": 143685263, "displayStart": 143685263, "displayStop": 143685263, "variantLength": 1, 
"referenceAllele": "G", "alternateAllele": "C", "positionVcf": 143685263, "referenceAlleleVcf": "G", "alternateAlleleVcf": "C"}, "hgncIds": ["HGNC:6469"]}
1+
{"accession": {"accession": "VCV000978270", "version": 1}, "rcvs": [{"accession": {"accession": "RCV001256675", "version": 1}, "title": "NM_003937.3(KYNU):c.326G>C (p.Trp109Ser) AND Catel-Manzke syndrome", "classifications": {"germlineClassification": {"reviewStatus": "AGGREGATE_GERMLINE_REVIEW_STATUS_CRITERIA_PROVIDED_SINGLE_SUBMITTER", "description": {"value": "Pathogenic", "dateLastEvaluated": "2012-01-07T00:00:00Z", "submissionCount": 1}}}}], "name": "NM_003937.3(KYNU):c.326G>C (p.Trp109Ser)", "variationType": "VARIATION_TYPE_SNV", "classifications": {"germlineClassification": {"reviewStatus": "AGGREGATE_GERMLINE_REVIEW_STATUS_CRITERIA_PROVIDED_SINGLE_SUBMITTER", "description": "Pathogenic", "dateLastEvaluated": "2012-01-07T00:00:00Z", "dateCreated": "2020-09-27T00:00:00Z", "mostRecentSubmission": "2020-09-27T00:00:00Z", "numberOfSubmitters": 1, "numberOfSubmissions": 1}}, "sequenceLocation": {"assembly": "GRCh37", "chr": "CHROMOSOME_2", "accession": "NC_000002.11", "start": 143685263, "stop": 143685263, "displayStart": 143685263, "displayStop": 143685263, "variantLength": 1, "referenceAllele": "G", "alternateAllele": "C", "positionVcf": 143685263, "referenceAlleleVcf": "G", "alternateAlleleVcf": "C"}, "hgncIds": ["HGNC:6469"]}

0 commit comments

Comments
 (0)