Skip to content

Commit 527b686

Browse files
committed
Merge branch 'master' into jdaw/add-indel-encoding
2 parents 6aa68cb + 825885c commit 527b686

11 files changed

+184
-104
lines changed

requirements.txt

+1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
# limitations under the License.
1515
#
1616

17+
bgzip==0.3.3
1718
h5py==2.10.0
1819
nemo-toolkit==0.10.1
1920
numpy==1.18.3

setup.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121

2222
import os
2323

24-
from setuptools import find_packages, setup
24+
from setuptools import setup, find_packages
2525

2626

2727
def get_verified_absolute_path(path):
@@ -83,8 +83,7 @@ def get_installation_requirments(file_path):
8383
get_verified_absolute_path(
8484
os.path.join(current_dir, 'requirements.txt')))
8585
],
86-
packages=find_packages(where=current_dir,
87-
include=['variantworks']),
86+
packages=find_packages(where=current_dir, include=["variantworks*"]),
8887
python_requires='>=3.7',
8988
long_description='Python libraries and utilities for manipulating '
9089
'genomics data',

tests/conftest.py

+51
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
#
2+
# Copyright 2020 NVIDIA CORPORATION.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
16+
17+
"""Share fixtures across multiple test files."""
18+
19+
import bgzip
20+
import os
21+
import pytest
22+
import subprocess
23+
import tempfile
24+
25+
26+
@pytest.fixture(scope='function')
def get_created_vcf_tabix_files():
    """Fixture for creating a compressed vcf file and its tabix index from a bytes string.

    Returns:
        A function which takes the vcf content as a bytes string, writes it
        bgzip-compressed into a temporary ``.vcf.gz`` file, indexes that file
        with the ``tabix`` command line tool and returns the tuple
        ``(vcf_file_path, tabix_file_path)``.

    Every file created through the returned function is removed when the
    fixture is torn down.
    """
    created_files = []

    def created_vcf_tabix_files(vcf_content):
        fd, vcf_path = tempfile.mkstemp(prefix='vw_test_file_', suffix='.vcf.gz')
        files_path = (vcf_path, vcf_path + ".tbi")
        # Register the paths before doing any work, so that teardown removes
        # the temporary file even when compression or indexing fails.
        created_files.append(files_path)
        # mkstemp hands back an already opened OS-level descriptor; wrap it
        # instead of re-opening the path, so the descriptor is closed and not leaked.
        with os.fdopen(fd, 'wb') as raw_fd:
            with bgzip.BGZipWriter(raw_fd) as fh:
                fh.write(vcf_content)
        # check=True raises CalledProcessError on a non-zero exit status.
        subprocess.run(['tabix', '-p', 'vcf', vcf_path], check=True)
        return files_path

    yield created_vcf_tabix_files

    # cleanup: try to remove every registered file, report the first failure afterwards
    first_failure = None
    for entry in created_files:
        for path in entry:
            try:
                os.remove(path)
            except FileNotFoundError:
                # The index (or the file itself) was never created, e.g. tabix failed.
                continue
            except OSError as err:
                if first_failure is None:
                    first_failure = (entry, err)
    if first_failure is not None:
        entry, err = first_failure
        raise type(err)('Can not remove input files: {}, {}'.format(*entry)) from err

tests/data/vcf_file_mock.py

+23-9
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,25 @@
1-
"""Contains mocked file object inputs for tests."""
1+
#
2+
# Copyright 2020 NVIDIA CORPORATION.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
216

3-
import io
17+
"""Contains mocked file object inputs for tests."""
418

519

620
def mock_file_input():
721
"""Return the content of an unfiltered vcf file as a bytes string."""
8-
return io.StringIO("""##fileformat=VCFv4.2
22+
return b"""##fileformat=VCFv4.2
923
##FILTER=<ID=PASS,Description="All filters passed">
1024
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
1125
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
@@ -28,15 +42,15 @@ def mock_file_input():
2842
1 240090 . T A 50 . DP=22;AF=0.0454545 GT:GQ 1:50
2943
1 240147 . C T 50 . DP=13;AF=0.692308 GT:GQ 1:50
3044
1 240154 . T C 50 . DP=13;AF=0.0769231 GT:GQ 1:50
31-
""")
45+
"""
3246

3347

3448
def mock_invalid_file_input():
3549
"""Return the bytes content of a vcf file which is supposed to raise a RuntimeError.
3650
3751
More than one called sample
3852
"""
39-
return io.StringIO("""##fileformat=VCFv4.2
53+
return b"""##fileformat=VCFv4.2
4054
##FILTER=<ID=PASS,Description="All filters passed">
4155
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
4256
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
@@ -45,12 +59,12 @@ def mock_invalid_file_input():
4559
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT CALLED CALLED2
4660
1 240147 . C T 50 . DP=13;AF=0.692308 GT:GQ 1:50 1/1:50
4761
1 240154 . T C 50 . DP=13;AF=0.0769231 GT:GQ 1:50 0/1:50
48-
""")
62+
"""
4963

5064

51-
def mock_vcf_file_reader_input(dummy_file_path):
65+
def mock_small_filtered_file_input():
5266
"""Return small filtered vcf content as a bytes string."""
53-
return io.StringIO("""##fileformat=VCFv4.2
67+
return b"""##fileformat=VCFv4.2
5468
##FILTER=<ID=PASS,Description="All filters passed">
5569
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
5670
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
@@ -60,4 +74,4 @@ def mock_vcf_file_reader_input(dummy_file_path):
6074
1 139861 . T A 50 . DP=15;AF=0.0666667 GT:GQ 0/1:50
6175
1 139976 . G A 50 . DP=35;AF=0.0185714 GT:GQ 1/1:50
6276
1 240147 . C T 50 . DP=13;AF=0.692308 GT:GQ 0/1:50
63-
""")
77+
"""

tests/test_vcf_writer.py

+18-27
Original file line numberDiff line numberDiff line change
@@ -22,45 +22,36 @@
2222
from variantworks.types import VariantZygosity
2323
from variantworks.result_writer import VCFResultWriter
2424

25-
from data.vcf_file_mock import mock_vcf_file_reader_input
25+
from data.vcf_file_mock import mock_small_filtered_file_input
2626

2727

28-
class MockPyVCFReader:
29-
original_pyvcf_reader_init_function = vcf.Reader.__init__
30-
31-
@staticmethod
32-
def new_vcf_reader_init(self, *args, **kargs):
33-
if 'filename' not in kargs: # Reader must be initiated using `filename`
34-
raise RuntimeError('Please use `filename` to initiate vcf.Reader')
35-
MockPyVCFReader.original_pyvcf_reader_init_function(
36-
self, mock_vcf_file_reader_input(kargs['filename']))
37-
38-
39-
def test_vcf_outputting(monkeypatch):
28+
def test_vcf_outputting(get_created_vcf_tabix_files):
4029
"""Write inference output into vcf files
4130
"""
42-
first_vcf_bam_tuple = VCFReader.VcfBamPath(
43-
vcf="/dummy/path1.gz", bam="temp.bam", is_fp=False)
44-
second_vcf_bam_tuple = VCFReader.VcfBamPath(
45-
vcf="/dummy/path2.gz", bam="temp.bam", is_fp=False)
46-
with monkeypatch.context() as mp:
47-
mp.setattr(vcf.Reader, "__init__", MockPyVCFReader.new_vcf_reader_init)
48-
vcf_loader = VCFReader([first_vcf_bam_tuple, second_vcf_bam_tuple])
31+
first_vcf_file_path, first_tabix_file_path = get_created_vcf_tabix_files(mock_small_filtered_file_input())
32+
second_vcf_file_path, second_tabix_file_path = get_created_vcf_tabix_files(mock_small_filtered_file_input())
33+
first_vcf_bam_tuple = VCFReader.VcfBamPath(vcf=first_vcf_file_path, bam=first_tabix_file_path, is_fp=False)
34+
second_vcf_bam_tuple = VCFReader.VcfBamPath(vcf=second_vcf_file_path, bam=second_tabix_file_path, is_fp=False)
35+
vcf_loader = VCFReader([first_vcf_bam_tuple, second_vcf_bam_tuple])
36+
4937
inferred_results = [VariantZygosity.HOMOZYGOUS, VariantZygosity.HOMOZYGOUS, VariantZygosity.HETEROZYGOUS,
5038
VariantZygosity.HETEROZYGOUS, VariantZygosity.HOMOZYGOUS, VariantZygosity.HETEROZYGOUS]
5139
assert (len(inferred_results) == len(vcf_loader))
52-
with monkeypatch.context() as mp:
53-
mp.setattr(vcf.Reader, "__init__", MockPyVCFReader.new_vcf_reader_init)
54-
result_writer = VCFResultWriter(vcf_loader, inferred_results)
55-
result_writer.write_output()
40+
41+
result_writer = VCFResultWriter(vcf_loader, inferred_results)
42+
result_writer.write_output()
43+
5644
# Validate output files format and make sure the outputted genotype for each record matches to the network output
45+
first_output_file_name = \
46+
'{}_{}.{}'.format("inferred", "".join(os.path.basename(first_vcf_file_path).split('.')[0:-2]), 'vcf')
47+
second_output_file_name = \
48+
'{}_{}.{}'.format("inferred", "".join(os.path.basename(second_vcf_file_path).split('.')[0:-2]), 'vcf')
5749
i = 0
58-
for f in ['inferred_path1.vcf', 'inferred_path2.vcf']:
50+
for f in [first_output_file_name, second_output_file_name]:
5951
vcf_reader = vcf.Reader(filename=os.path.join(
6052
result_writer.output_location, f))
6153
for record in vcf_reader:
62-
assert(record.samples[0]['GT'] ==
63-
result_writer.zygosity_to_vcf_genotype[inferred_results[i]])
54+
assert(record.samples[0]['GT'] == result_writer.zygosity_to_vcf_genotype[inferred_results[i]])
6455
i += 1
6556
assert (i == 6)
6657
# Clean up files

tests/test_vcfio.py

+22-56
Original file line numberDiff line numberDiff line change
@@ -15,93 +15,59 @@
1515
#
1616

1717
import pytest
18-
import vcf
1918

2019
from variantworks.io.vcfio import VCFReader
2120
from variantworks.types import VariantZygosity, Variant
2221

2322
from data.vcf_file_mock import mock_file_input, mock_invalid_file_input
2423

2524

26-
class MockPyVCFReader:
27-
original_pyvcf_reader_init_function = vcf.Reader.__init__
28-
29-
@staticmethod
30-
def new_vcf_reader_init(self, *args, **kargs):
31-
MockPyVCFReader.original_pyvcf_reader_init_function(
32-
self, mock_file_input())
33-
34-
@staticmethod
35-
def new_bad_vcf_reader_init(self, *args, **kargs):
36-
MockPyVCFReader.original_pyvcf_reader_init_function(
37-
self, mock_invalid_file_input())
38-
39-
@staticmethod
40-
def get_vcf(mp, vcf_bam_list):
41-
with mp.context() as m:
42-
# Mock vcf.Reader.__init__() return value
43-
m.setattr(vcf.Reader, "__init__",
44-
MockPyVCFReader.new_vcf_reader_init)
45-
vcf_loader = VCFReader(vcf_bam_list)
46-
return vcf_loader
47-
48-
@staticmethod
49-
def get_invalid_vcf(mp, vcf_bam_list):
50-
with mp.context() as m:
51-
# Mock vcf.Reader.__init__() return value
52-
m.setattr(vcf.Reader, "__init__",
53-
MockPyVCFReader.new_bad_vcf_reader_init)
54-
vcf_loader = VCFReader(vcf_bam_list)
55-
return vcf_loader
56-
57-
58-
def test_vcf_loader_snps(monkeypatch):
25+
def test_vcf_loader_snps(get_created_vcf_tabix_files):
5926
"""Get all variants from mocked file stream, filter SNPs, multi allele & multi samples
6027
"""
61-
vcf_bam_tuple = VCFReader.VcfBamPath(
62-
vcf="/dummy/path.gz", bam="temp.bam", is_fp=False)
63-
vcf_loader = MockPyVCFReader.get_vcf(monkeypatch, [vcf_bam_tuple])
28+
vcf_file_path, tabix_file_path = get_created_vcf_tabix_files(mock_file_input())
29+
vcf_bam_tuple = VCFReader.VcfBamPath(vcf=vcf_file_path, bam=tabix_file_path, is_fp=False)
30+
vcf_loader = VCFReader([vcf_bam_tuple])
6431
assert(len(vcf_loader) == 15)
6532

6633

67-
def test_vcf_fetch_variant(monkeypatch):
34+
def test_vcf_fetch_variant(get_created_vcf_tabix_files):
6835
"""Get first variant from mocked VCF file stream.
6936
"""
70-
vcf_bam_tuple = VCFReader.VcfBamPath(vcf="/dummy/path.gz", bam="temp.bam", is_fp=False)
71-
vcf_loader = MockPyVCFReader.get_vcf(monkeypatch, [vcf_bam_tuple])
37+
vcf_file_path, tabix_file_path = get_created_vcf_tabix_files(mock_file_input())
38+
vcf_bam_tuple = VCFReader.VcfBamPath(vcf=vcf_file_path, bam=tabix_file_path, is_fp=False)
39+
vcf_loader = VCFReader([vcf_bam_tuple])
7240
try:
7341
assert (type(vcf_loader[0]) == Variant)
7442
except IndexError:
7543
pytest.fail("Can not retrieve first element from VCFReader")
7644

7745

78-
def test_vcf_load_fp(monkeypatch):
46+
def test_vcf_load_fp(get_created_vcf_tabix_files):
7947
"""Get first variant from false positive mocked VCF file stream and check zygosity.
8048
"""
81-
vcf_bam_tuple = VCFReader.VcfBamPath(
82-
vcf="/dummy/path.gz", bam="temp.bam", is_fp=True)
83-
vcf_loader = MockPyVCFReader.get_vcf(monkeypatch, [vcf_bam_tuple])
49+
vcf_file_path, tabix_file_path = get_created_vcf_tabix_files(mock_file_input())
50+
vcf_bam_tuple = VCFReader.VcfBamPath(vcf=vcf_file_path, bam=tabix_file_path, is_fp=True)
51+
vcf_loader = VCFReader([vcf_bam_tuple])
8452
for v in vcf_loader:
8553
assert(v.zygosity == VariantZygosity.NO_VARIANT)
8654

8755

88-
def test_vcf_load_variant_from_multiple_files(monkeypatch):
56+
def test_vcf_load_variant_from_multiple_files(get_created_vcf_tabix_files):
8957
"""Get variants from multiple mocked VCF files.
9058
"""
91-
first_vcf_bam_tuple = VCFReader.VcfBamPath(
92-
vcf="/dummy/path.gz", bam="temp.bam", is_fp=False)
93-
second_vcf_bam_tuple = VCFReader.VcfBamPath(
94-
vcf="/dummy/path.gz", bam="temp.bam", is_fp=False)
95-
vcf_loader = MockPyVCFReader.get_vcf(monkeypatch, [first_vcf_bam_tuple])
96-
vcf_loader_2x = MockPyVCFReader.get_vcf(
97-
monkeypatch, [first_vcf_bam_tuple, second_vcf_bam_tuple])
59+
vcf_file_path, tabix_file_path = get_created_vcf_tabix_files(mock_file_input())
60+
first_vcf_bam_tuple = VCFReader.VcfBamPath(vcf=vcf_file_path, bam=tabix_file_path, is_fp=False)
61+
second_vcf_bam_tuple = VCFReader.VcfBamPath(vcf=vcf_file_path, bam=tabix_file_path, is_fp=False)
62+
vcf_loader = VCFReader([first_vcf_bam_tuple])
63+
vcf_loader_2x = VCFReader([first_vcf_bam_tuple, second_vcf_bam_tuple])
9864
assert (2 * len(vcf_loader) == len(vcf_loader_2x))
9965

10066

101-
def test_load_vcf_content_with_wrong_format(monkeypatch):
67+
def test_load_vcf_content_with_wrong_format(get_created_vcf_tabix_files):
10268
""" parse vcf file with wrong format
10369
"""
104-
vcf_bam_tuple = VCFReader.VcfBamPath(
105-
vcf="/dummy/path.gz", bam="temp.bam", is_fp=False)
70+
vcf_file_path, tabix_file_path = get_created_vcf_tabix_files(mock_invalid_file_input())
71+
vcf_bam_tuple = VCFReader.VcfBamPath(vcf=vcf_file_path, bam=tabix_file_path, is_fp=False)
10672
with pytest.raises(RuntimeError):
107-
MockPyVCFReader.get_invalid_vcf(monkeypatch, [vcf_bam_tuple])
73+
VCFReader([vcf_bam_tuple])

variantworks/__init__.py

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#
2+
# Copyright 2020 NVIDIA CORPORATION.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
16+
17+
"""Init file for variantworks."""

variantworks/io/__init__.py

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#
2+
# Copyright 2020 NVIDIA CORPORATION.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
16+
17+
"""Init file for I/O module."""

0 commit comments

Comments
 (0)