Skip to content

Commit e52da3d

Browse files
committed
Remove contig_id
1 parent 41cc370 commit e52da3d

File tree

5 files changed

+70
-116
lines changed

5 files changed

+70
-116
lines changed

CHANGELOG.md

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,8 @@
1212
- If a mismatch ratio is provided to the `infer` command, it only applies during the
1313
`match_samples` phase ({issue}`980`, {pr}`981`, {user}`hyanwong`)
1414

15-
- Allow a contig to be selected by name (`contig_id`), and get the `sequence_length`
16-
of the contig associated with the unmasked sites, if contig lengths are provided
17-
({pr}`964`, {user}`hyanwong`)
15+
- Get the `sequence_length` of the contig associated with the unmasked sites,
16+
if contig lengths are provided ({pr}`964`, {user}`hyanwong`, {user}`benjeffery`)
1817

1918
**Fixes**
2019

docs/usage.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,8 +109,7 @@ It is also possible to *completely* exclude sites and samples, by specifing a bo
109109
a mask value of `True` will be completely omitted both from inference and the final tree sequence.
110110
This can be useful, for example, if you wish to select only a subset of the chromosome for
111111
inference, e.g. to reduce computational load. You can also use it to subset inference to a
112-
particular contig, if your dataset contains multiple contigs (although this can be more easily
113-
done using the `contig_id` parameter). Note that if a `site_mask` is provided,
112+
particular contig, if your dataset contains multiple contigs. Note that if a `site_mask` is provided,
114113
the ancestral states array should only specify alleles for the unmasked sites.
115114

116115
Below, for instance, is an example of including only sites up to position six in the contig

tests/test_variantdata.py

Lines changed: 32 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ def test_variantdata_accessors(tmp_path, in_mem):
145145
assert vd.format_name == "tsinfer-variant-data"
146146
assert vd.format_version == (0, 1)
147147
assert vd.finalised
148-
assert vd.sequence_length == ts.sequence_length + 1337
148+
assert vd.sequence_length == ts.sequence_length
149149
assert vd.num_sites == ts.num_sites
150150
assert vd.sites_metadata_schema == ts.tables.sites.metadata_schema.schema
151151
assert vd.sites_metadata == [site.metadata for site in ts.sites()]
@@ -317,6 +317,11 @@ def test_simulate_genotype_call_dataset_length(tmp_path):
317317
vdata = tsinfer.VariantData(tmp_path, ds["variant_allele"][:, 0].values.astype(str))
318318
assert vdata.sequence_length == ts.sites_position[-1] + 1
319319

320+
vdata = tsinfer.VariantData(
321+
tmp_path, ds["variant_allele"][:, 0].values.astype(str), sequence_length=1337
322+
)
323+
assert vdata.sequence_length == 1337
324+
320325

321326
class TestMultiContig:
322327
def make_two_ts_dataset(self, path):
@@ -359,37 +364,44 @@ def test_mask(self, tmp_path):
359364
assert vdata.sequence_length == ts2.sequence_length
360365

361366
@pytest.mark.parametrize("contig_id", ["chr1", "chr2"])
362-
def test_contig_id_param(self, contig_id, tmp_path):
367+
def test_multi_contig(self, contig_id, tmp_path):
363368
tree_seqs = {}
364369
tree_seqs["chr1"], tree_seqs["chr2"] = self.make_two_ts_dataset(tmp_path)
370+
with pytest.raises(ValueError, match="multiple contigs"):
371+
vdata = tsinfer.VariantData(tmp_path, "variant_ancestral_allele")
372+
root = zarr.open(tmp_path)
373+
mask = root["variant_contig"][:] == (1 if contig_id == "chr1" else 0)
365374
vdata = tsinfer.VariantData(
366-
tmp_path, "variant_ancestral_allele", contig_id=contig_id
375+
tmp_path, "variant_ancestral_allele", site_mask=mask
367376
)
368377
assert np.all(tree_seqs[contig_id].sites_position == vdata.sites_position)
369378
assert vdata.contig_id == contig_id
379+
assert vdata._contig_index == (0 if contig_id == "chr1" else 1)
370380
assert vdata.sequence_length == tree_seqs[contig_id].sequence_length
371381

372-
def test_contig_id_param_and_mask(self, tmp_path):
382+
def test_mixed_contigs_error(self, tmp_path):
373383
ts1, ts2 = self.make_two_ts_dataset(tmp_path)
374-
vdata = tsinfer.VariantData(
375-
tmp_path,
376-
"variant_ancestral_allele",
377-
site_mask=np.array(
378-
(ts1.num_sites + 1) * [True] + (ts2.num_sites - 1) * [False]
379-
),
380-
contig_id="chr2",
381-
)
382-
assert np.all(ts2.sites_position[1:] == vdata.sites_position)
383-
assert vdata.contig_id == "chr2"
384+
mask = np.ones(ts1.num_sites + ts2.num_sites)
385+
# Select two varaints, one from each contig
386+
mask[0] = False
387+
mask[-1] = False
388+
with pytest.raises(ValueError, match="multiple contigs"):
389+
tsinfer.VariantData(
390+
tmp_path,
391+
"variant_ancestral_allele",
392+
site_mask=mask,
393+
)
384394

385-
@pytest.mark.parametrize("contig_id", ["chr1", "chr2"])
386-
def test_contig_length(self, contig_id, tmp_path):
387-
tree_seqs = {}
388-
tree_seqs["chr1"], tree_seqs["chr2"] = self.make_two_ts_dataset(tmp_path)
395+
def test_no_variant_contig(self, tmp_path):
396+
ts1, ts2 = self.make_two_ts_dataset(tmp_path)
397+
root = zarr.open(tmp_path)
398+
del root["variant_contig"]
399+
mask = np.ones(ts1.num_sites + ts2.num_sites)
400+
mask[0] = False
389401
vdata = tsinfer.VariantData(
390-
tmp_path, "variant_ancestral_allele", contig_id=contig_id
402+
tmp_path, "variant_ancestral_allele", site_mask=mask
391403
)
392-
assert vdata.sequence_length == tree_seqs[contig_id].sequence_length
404+
assert vdata.sequence_length == ts1.sites_position[0] + 1
393405

394406

395407
@pytest.mark.skipif(sys.platform == "win32", reason="File permission errors on Windows")
@@ -953,23 +965,6 @@ def test_unimplemented_from_tree_sequence(self):
953965
with pytest.raises(NotImplementedError):
954966
tsinfer.VariantData.from_tree_sequence(None)
955967

956-
def test_multiple_contigs(self, tmp_path):
957-
path = tmp_path / "data.zarr"
958-
ds = sgkit.simulate_genotype_call_dataset(n_variant=3, n_sample=3, phased=True)
959-
ds["contig_id"] = (
960-
ds["contig_id"].dims,
961-
np.array(["c10", "c11"], dtype="<U3"),
962-
)
963-
ds["variant_contig"] = (
964-
ds["variant_contig"].dims,
965-
np.array([0, 0, 1], dtype=ds["variant_contig"].dtype),
966-
)
967-
sgkit.save_dataset(ds, path)
968-
with pytest.raises(
969-
ValueError, match=r'Sites belong to multiple contigs \("c10", "c11"\)'
970-
):
971-
tsinfer.VariantData(path, ds["variant_allele"][:, 0].astype(str))
972-
973968
def test_all_masked(self, tmp_path):
974969
path = tmp_path / "data.zarr"
975970
ds = sgkit.simulate_genotype_call_dataset(n_variant=3, n_sample=3, phased=True)
@@ -979,28 +974,6 @@ def test_all_masked(self, tmp_path):
979974
path, ds["variant_allele"][:, 0].astype(str), site_mask=np.ones(3, bool)
980975
)
981976

982-
def test_bad_contig_param(self, tmp_path):
983-
path = tmp_path / "data.zarr"
984-
ds = sgkit.simulate_genotype_call_dataset(n_variant=3, n_sample=3, phased=True)
985-
sgkit.save_dataset(ds, path)
986-
with pytest.raises(ValueError, match='"XX" not found'):
987-
tsinfer.VariantData(
988-
path, ds["variant_allele"][:, 0].astype(str), contig_id="XX"
989-
)
990-
991-
def test_multiple_contig_param(self, tmp_path):
992-
path = tmp_path / "data.zarr"
993-
ds = sgkit.simulate_genotype_call_dataset(n_variant=3, n_sample=3, phased=True)
994-
ds["contig_id"] = (
995-
ds["contig_id"].dims,
996-
np.array(["chr1", "chr1"], dtype="<U4"),
997-
)
998-
sgkit.save_dataset(ds, path)
999-
with pytest.raises(ValueError, match='Multiple contigs named "chr1"'):
1000-
tsinfer.VariantData(
1001-
path, ds["variant_allele"][:, 0].astype(str), contig_id="chr1"
1002-
)
1003-
1004977
def test_missing_sites_time(self, tmp_path):
1005978
path = tmp_path / "data.zarr"
1006979
ds = sgkit.simulate_genotype_call_dataset(n_variant=3, n_sample=3, phased=True)

tests/tsutil.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -339,11 +339,6 @@ def _make_ts_and_zarr(path, add_optional=False, shuffle_alleles=True):
339339
)
340340

341341
if add_optional:
342-
add_attribute_to_dataset(
343-
"sequence_length",
344-
ts.sequence_length + 1337,
345-
path / "data.zarr",
346-
)
347342
sites_md = tables.sites.metadata
348343
sites_md_offset = tables.sites.metadata_offset
349344
add_array_to_dataset(

tsinfer/formats.py

Lines changed: 35 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -2328,8 +2328,7 @@ class VariantData(SampleData):
23282328
:param Union(array, str) site_mask: A numpy array of booleans specifying which
23292329
sites to mask out (exclude) from the dataset. Alternatively, a string
23302330
can be provided, giving the name of an array in the input dataset which contains
2331-
the site mask. If ``None`` (default), all sites are included (unless restricted
2332-
to a ``contig_id``, see below).
2331+
the site mask. If ``None`` (default), all sites are included.
23332332
:param Union(array, str) sites_time: A numpy array of floats specifying the relative
23342333
time of occurrence of the mutation to the derived state at each site. This must
23352334
be of the same length as the number of unmasked sites. Alternatively, a
@@ -2339,11 +2338,11 @@ class VariantData(SampleData):
23392338
reasonable approximation to the relative order of ancestors used for inference.
23402339
Time values are ignored for sites not used in inference, such as singletons,
23412340
sites with more than two alleles, or sites with an unknown ancestral state.
2342-
:param str contig_id: The name of the contig to use (e.g. "chr1"), if the .vcz file
2343-
contains multiple contigs; contig names can be found in the `.contig_id array
2344-
of the input dataset. If provided, sites associated with any other contigs will
2345-
be added to the sites that are masked out. If ``None`` (default), do not mark
2346-
out sites on the basis of their contig ID.
2341+
:param int sequence_length: An integer specifying the resulting `sequence_length`
2342+
attribute of the output tree sequence. If not specified the `contig_length`
2343+
attribute from the undelying zarr store for the contig of the used variants.
2344+
If that is not present then the maximum position plus one of the used variants
2345+
is used.
23472346
"""
23482347

23492348
FORMAT_NAME = "tsinfer-variant-data"
@@ -2357,8 +2356,10 @@ def __init__(
23572356
sample_mask=None,
23582357
site_mask=None,
23592358
sites_time=None,
2360-
contig_id=None,
2359+
sequence_length=None,
23612360
):
2361+
self._sequence_length = sequence_length
2362+
self._contig_index = None
23622363
try:
23632364
if len(path_or_zarr.call_genotype.shape) == 3:
23642365
# Assumed to be a VCF Zarr hierarchy
@@ -2391,24 +2392,15 @@ def __init__(
23912392
raise ValueError(
23922393
"Site mask array must be the same length as the number of unmasked sites"
23932394
)
2394-
if contig_id is not None:
2395-
contig_index = np.where(self.data.contig_id[:] == contig_id)[0]
2396-
if len(contig_index) == 0:
2397-
raise ValueError(
2398-
f'"{contig_id}" not found among the available contig IDs: '
2399-
+ ",".join(f"{n}" for n in self.data.contig_id[:])
2400-
)
2401-
elif len(contig_index) > 1:
2402-
raise ValueError(f'Multiple contigs named "{contig_id}"')
2403-
contig_index = contig_index[0]
2404-
site_mask = np.logical_or(
2405-
site_mask, self.data["variant_contig"][:] != contig_index
2406-
)
24072395

24082396
# We negate the mask as it is much easier in numpy to have True=keep
24092397
self.sites_select = ~site_mask.astype(bool)
2398+
24102399
if np.sum(self.sites_select) == 0:
2411-
raise ValueError("All sites have been masked out. Please unmask some")
2400+
raise ValueError(
2401+
"All sites have been masked out, at least one value"
2402+
"must be 'False' in the site mask"
2403+
)
24122404

24132405
if sample_mask is None:
24142406
sample_mask = np.full(self._num_individuals_before_mask, False, dtype=bool)
@@ -2439,18 +2431,19 @@ def __init__(
24392431
" unphased"
24402432
)
24412433

2442-
used_contigs = self.data.variant_contig[:][self.sites_select]
2443-
self._contig_index = used_contigs[0]
2444-
self._contig_id = self.data.contig_id[self._contig_index]
2434+
if "variant_contig" in self.data:
2435+
used_contigs = self.data.variant_contig[:][self.sites_select]
2436+
self._contig_index = used_contigs[0]
2437+
self._contig_id = self.data.contig_id[self._contig_index]
24452438

2446-
if np.any(used_contigs != self._contig_index):
2447-
contig_names = ", ".join(
2448-
f'"{self.data.contig_id[c]}"' for c in np.unique(used_contigs)
2449-
)
2450-
raise ValueError(
2451-
f"Sites belong to multiple contigs ({contig_names}). Please restrict "
2452-
"sites to one contig e.g. via the `contig_id` argument."
2453-
)
2439+
if np.any(used_contigs != self._contig_index):
2440+
contig_names = ", ".join(
2441+
f'"{self.data.contig_id[c]}"' for c in np.unique(used_contigs)
2442+
)
2443+
raise ValueError(
2444+
f"Sites belong to multiple contigs ({contig_names}). "
2445+
"Please restrict sites to one contig using the sites_mask argument."
2446+
)
24542447

24552448
if np.any(np.diff(self.sites_position) <= 0):
24562449
raise ValueError(
@@ -2558,26 +2551,21 @@ def finalised(self):
25582551
def sequence_length(self):
25592552
"""
25602553
The sequence length of the contig associated with sites used in the dataset.
2561-
If the dataset has a "sequence_length" attribute, this is always used, otherwise
2562-
if the dataset has recorded contig lengths, the appropriate length is taken,
2563-
otherwise the length is calculated from the maximum variant position plus one.
2554+
If set manually then that value is used else if the dataset has recorded
2555+
contig lengths use that else the length is calculated from the maximum
2556+
variant position plus one.
25642557
"""
2565-
try:
2566-
return self.data.attrs["sequence_length"]
2567-
except KeyError:
2568-
if self._contig_index is not None:
2569-
try:
2570-
if self._contig_index < len(self.data.contig_length):
2571-
return self.data.contig_length[self._contig_index]
2572-
except AttributeError:
2573-
pass # contig_length is optional, fall back to calculating length
2574-
return int(np.max(self.data["variant_position"])) + 1
2558+
if self._sequence_length is not None:
2559+
return self._sequence_length
2560+
if self._contig_index is not None and "contig_length" in self.data:
2561+
return self.data.contig_length[self._contig_index]
2562+
return int(np.max(self.sites_position)) + 1
25752563

25762564
@property
25772565
def contig_id(self):
25782566
"""
25792567
The contig ID (name) for all used sites, or None if no
2580-
contig IDs were provided
2568+
contig IDs were present in the zarr dataset
25812569
"""
25822570
return self._contig_id
25832571

0 commit comments

Comments
 (0)