Skip to content

Commit 3068a36

Browse files
Merge pull request #471 from jeromekelleher/map-deletions
Mapping deletions with metadata
2 parents 3158f29 + 2e1ceea commit 3068a36

File tree

3 files changed

+116
-38
lines changed

3 files changed

+116
-38
lines changed

Diff for: sc2ts/dataset.py

-38
Original file line numberDiff line numberDiff line change
@@ -317,44 +317,6 @@ def reorder(self, path, additional_fields=list(), show_progress=False):
317317
index = np.lexsort([self.metadata.fields[f] for f in sort_key[::-1]])
318318
self.copy(path, sample_id=sample_id[index], show_progress=show_progress)
319319

320-
def map_deletions_to_ts(self, ts, start, end):
321-
"""
322-
Map a region of the dataset onto the specified tree sequence,
323-
adding deletions to the tree sequence where necessary.
324-
The tree sequence samples must all be present in the dataset.
325-
A new tree sequence is returned whose original mutations in the
326-
region have been removed and replaced using parsimony
327-
"""
328-
sample_id = ts.metadata["sc2ts"]["samples_strain"]
329-
if sample_id[0].startswith("Wuhan"):
330-
# Note: a lot of fiddling around here is due to the potential for sample 0
331-
# to be the reference, which is not in the viridian dataset.
332-
sample_id = sample_id[1:]
333-
tables = ts.dump_tables()
334-
n_tm = ts.nodes_time
335-
tree = ts.first()
336-
del_sites = [ts.site(position=p).id for p in range(start, end)]
337-
tables.mutations.keep_rows(np.logical_not(np.isin(ts.mutations_site, del_sites)))
338-
for var in self.variants(sample_id, np.arange(start, end)):
339-
tree.seek(var.position)
340-
site = ts.site(position=var.position)
341-
# treat non-nucleotide alleles (other IUPAC codes) as missing , but keep "-"
342-
keep = np.isin(var.alleles, np.array(['A', 'C', 'G', 'T', '-']))
343-
g = var.genotypes.copy()
344-
g[keep[g] == False] = tskit.MISSING_DATA
345-
anc = site.ancestral_state
346-
# Pad the start with the anc state (non-viridian) Wuhan sample, so add that
347-
if len(g) < ts.num_samples:
348-
g = np.append([list(var.alleles).index(anc)], g)
349-
_, mutations = tree.map_mutations(g, list(var.alleles), ancestral_state=anc)
350-
m_id_map = {tskit.NULL: tskit.NULL}
351-
for list_id, m in enumerate(mutations):
352-
m_id_map[list_id] = tables.mutations.append(
353-
m.replace(site=site.id, parent=m_id_map[m.parent], time=n_tm[m.node])
354-
)
355-
tables.sort()
356-
return tables.tree_sequence()
357-
358320
@staticmethod
359321
def new(path, samples_chunk_size=None, variants_chunk_size=None):
360322

Diff for: sc2ts/inference.py

+80
Original file line numberDiff line numberDiff line change
@@ -1945,3 +1945,83 @@ def get_recombinant_strains(ts):
19451945
group_id = node.metadata["sc2ts"]["group_id"]
19461946
ret[u] = groups[group_id]
19471947
return ret
1948+
1949+
1950+
def map_deletions(ts, ds, *, frequency_threshold, show_progress=False):
1951+
"""
1952+
Map deletions at sites that exceed the specified frequency threshold onto the
1953+
ARG using parsimony (excluding flanks).
1954+
"""
1955+
genes = core.get_gene_coordinates()
1956+
start = genes["ORF1ab"][0]
1957+
end = genes["ORF10"][1]
1958+
1959+
del_sites = []
1960+
keep_mutations = np.ones(ts.num_mutations, dtype=bool)
1961+
for site in ts.sites():
1962+
if start <= site.position < end:
1963+
deletion_samples = site.metadata["sc2ts"]["deletion_samples"]
1964+
if deletion_samples / ts.num_samples >= frequency_threshold:
1965+
del_sites.append(site.id)
1966+
site_start = np.searchsorted(ts.mutations_site, site.id, side="left")
1967+
site_stop = np.searchsorted(ts.mutations_site, site.id, side="right")
1968+
keep_mutations[site_start:site_stop] = False
1969+
1970+
sample_id = ts.metadata["sc2ts"]["samples_strain"]
1971+
assert not sample_id[0].startswith("Wuhan")
1972+
1973+
tables = ts.dump_tables()
1974+
tree = ts.first()
1975+
1976+
tables.mutations.keep_rows(keep_mutations)
1977+
1978+
variants = get_progress(
1979+
ds.variants(sample_id, ts.sites_position[del_sites]),
1980+
title="Map deletions",
1981+
phase="",
1982+
show_progress=show_progress,
1983+
total=len(del_sites),
1984+
)
1985+
1986+
mut_metadata = {"sc2ts": {"type": "post_parsimony"}}
1987+
site_metadata = {}
1988+
for var in variants:
1989+
tree.seek(var.position)
1990+
site = ts.site(position=var.position)
1991+
old_mutations = []
1992+
for mut in site.mutations:
1993+
assert not keep_mutations[mut.id]
1994+
old_mutations.append(
1995+
{
1996+
"node": mut.node,
1997+
"derived_state": mut.derived_state,
1998+
"metadata": mut.metadata,
1999+
}
2000+
)
2001+
md = dict(site.metadata)
2002+
md["sc2ts"]["original_mutations"] = old_mutations
2003+
site_metadata[site.id] = md
2004+
g = mask_ambiguous(var.genotypes)
2005+
deletion_samples = site.metadata["sc2ts"]["deletion_samples"]
2006+
assert deletion_samples == np.sum(g == DELETION)
2007+
_, mutations = tree.map_mutations(
2008+
g, list(var.alleles), ancestral_state=site.ancestral_state
2009+
)
2010+
for m in mutations:
2011+
tables.mutations.add_row(
2012+
site=site.id,
2013+
node=m.node,
2014+
derived_state=m.derived_state,
2015+
time=ts.nodes_time[m.node],
2016+
metadata=mut_metadata,
2017+
)
2018+
2019+
tables.sites.clear()
2020+
for site in ts.sites():
2021+
md = site_metadata.get(site.id, site.metadata)
2022+
tables.sites.append(site.replace(metadata=md))
2023+
2024+
tables.sort()
2025+
tables.build_index()
2026+
tables.compute_mutation_parents()
2027+
return tables.tree_sequence()

Diff for: tests/test_inference.py

+36
Original file line numberDiff line numberDiff line change
@@ -1376,3 +1376,39 @@ def test_one_leaf_mutation(self, samples, result):
13761376
tables.mutations.add_row(site=0, node=3, derived_state="T")
13771377
ts = tables.tree_sequence()
13781378
nt.assert_array_equal(sc2ts.extract_haplotypes(ts, samples), result)
1379+
1380+
1381+
class TestMapDeletions:
1382+
def test_example(self, fx_ts_map, fx_dataset):
1383+
ts = fx_ts_map["2020-02-13"]
1384+
new_ts = sc2ts.map_deletions(ts, fx_dataset, frequency_threshold=0.001)
1385+
remapped_sites = [
1386+
j
1387+
for j in range(ts.num_sites)
1388+
if "original_mutations" in new_ts.site(j).metadata["sc2ts"]
1389+
]
1390+
assert remapped_sites == [1541, 3945, 3946, 3947]
1391+
1392+
for site_id in remapped_sites:
1393+
site = new_ts.site(site_id)
1394+
old_site = ts.site(site_id)
1395+
original_mutations = site.metadata["sc2ts"]["original_mutations"]
1396+
assert original_mutations == [
1397+
{
1398+
"node": mutation.node,
1399+
"derived_state": mutation.derived_state,
1400+
"metadata": mutation.metadata,
1401+
}
1402+
for mutation in old_site.mutations
1403+
]
1404+
d = site.metadata["sc2ts"]
1405+
del d["original_mutations"]
1406+
assert old_site.metadata["sc2ts"] == d
1407+
1408+
for mut in site.mutations:
1409+
assert mut.metadata["sc2ts"]["type"] == "post_parsimony"
1410+
1411+
def test_validate(self, fx_ts_map, fx_dataset):
1412+
ts = fx_ts_map["2020-02-13"]
1413+
new_ts = sc2ts.map_deletions(ts, fx_dataset, frequency_threshold=0.001)
1414+
sc2ts.validate(new_ts, fx_dataset, deletions_as_missing=False)

0 commit comments

Comments
 (0)