Skip to content

Commit add1e46

Browse files
hyanwong authored and
benjeffery committed
Check duplicate positions
Fixes #888
1 parent 92b3a37 commit add1e46

File tree

2 files changed

+22
-0
lines changed

2 files changed

+22
-0
lines changed

tests/test_sgkit.py

+16
Original file line numberDiff line numberDiff line change
@@ -600,6 +600,22 @@ def test_ploidy1_unphased(self, tmp_path):
600600
sgkit.save_dataset(ds, path)
601601
tsinfer.SgkitSampleData(path)
602602

603+
def test_duplicate_positions(self, tmp_path):
    """Loading a dataset with a repeated variant position raises ValueError."""
    dataset = sgkit.simulate_genotype_call_dataset(
        n_variant=3, n_sample=3, phased=True
    )
    # Copy the second position over the third so two sites share a position.
    positions = dataset["variant_position"]
    positions[2] = positions[1]
    zarr_path = tmp_path / "data.zarr"
    sgkit.save_dataset(dataset, zarr_path)
    with pytest.raises(ValueError, match="duplicate or out-of-order values"):
        tsinfer.SgkitSampleData(zarr_path)
610+
611+
def test_bad_order_positions(self, tmp_path):
    """Loading a dataset whose variant positions decrease raises ValueError."""
    path = tmp_path / "data.zarr"
    ds = sgkit.simulate_genotype_call_dataset(n_variant=3, n_sample=3, phased=True)
    # Push the first position beyond the last one so the array is genuinely
    # out of order and contains no repeated value. A whole-number offset is
    # used on purpose: assuming the simulated positions are stored as
    # integers (TODO confirm), a fractional value such as "- 0.5" would be
    # truncated on assignment and could collapse into a plain duplicate,
    # leaving the out-of-order case untested.
    ds["variant_position"][0] = ds["variant_position"][2] + 1
    sgkit.save_dataset(ds, path)
    with pytest.raises(ValueError, match="duplicate or out-of-order values"):
        tsinfer.SgkitSampleData(path)
618+
603619
def test_empty_alleles_not_at_end(self, tmp_path):
604620
path = tmp_path / "data.zarr"
605621
ds = sgkit.simulate_genotype_call_dataset(n_variant=3, n_sample=3, n_ploidy=1)

tsinfer/formats.py

+6
Original file line numberDiff line numberDiff line change
@@ -2309,6 +2309,12 @@ def __init__(self, path):
23092309
" sgkit dataset, indicating that all the genotypes are"
23102310
" unphased"
23112311
)
2312+
if np.any(np.diff(self.sites_position) <= 0):
2313+
raise ValueError(
2314+
"Values taken from the variant_position array are not strictly "
2315+
"increasing (i.e. have duplicate or out-of-order values). "
2316+
"These must be masked out to run tsinfer."
2317+
)
23122318

23132319
@functools.cached_property
23142320
def format_name(self):

0 commit comments

Comments
 (0)