Skip to content

Commit 393636f

Browse files
hyanwongjeromekelleher
authored andcommitted
Add map_deletions_to_ts as Dataset method
1 parent a505f5a commit 393636f

File tree

1 file changed

+39
-0
lines changed

1 file changed

+39
-0
lines changed

sc2ts/dataset.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import logging
1010
import pathlib
1111

12+
import tskit
1213
import tqdm
1314
import pandas as pd
1415
import zarr
@@ -317,6 +318,44 @@ def reorder(self, path, additional_fields=list(), show_progress=False):
317318
index = np.lexsort([self.metadata.fields[f] for f in sort_key[::-1]])
318319
self.copy(path, sample_id=sample_id[index], show_progress=show_progress)
319320

321+
def map_deletions_to_ts(self, ts, start, end):
322+
"""
323+
Map a region of the dataset onto the specified tree sequence,
324+
adding deletions to the tree sequence where necessary.
325+
The tree sequence samples must all be present in the dataset.
326+
A new tree sequence is returned whose original mutations in the
327+
region have been removed and replaced using parsimony
328+
"""
329+
sample_id = ts.metadata["sc2ts"]["samples_strain"]
330+
if sample_id[0].startswith("Wuhan"):
331+
# Note: a lot of fiddling around here is due to the potential for sample 0
332+
# to be the reference, which is not in the viridian dataset.
333+
sample_id = sample_id[1:]
334+
tables = ts.dump_tables()
335+
n_tm = ts.nodes_time
336+
tree = ts.first()
337+
del_sites = [ts.site(position=p).id for p in range(start, end)]
338+
tables.mutations.keep_rows(np.logical_not(np.isin(ts.mutations_site, del_sites)))
339+
for var in self.variants(sample_id, np.arange(start, end)):
340+
tree.seek(var.position)
341+
site = ts.site(position=var.position)
342+
# treat non-nucleotide alleles (other IUPAC codes) as missing , but keep "-"
343+
keep = np.isin(var.alleles, np.array(['A', 'C', 'G', 'T', '-']))
344+
g = var.genotypes.copy()
345+
g[keep[g] == False] = tskit.MISSING_DATA
346+
anc = site.ancestral_state
347+
# Pad the start with the anc state (non-viridian) Wuhan sample, so add that
348+
if len(g) < ts.num_samples:
349+
g = np.append([list(var.alleles).index(anc)], g)
350+
_, mutations = tree.map_mutations(g, list(var.alleles), ancestral_state=anc)
351+
m_id_map = {tskit.NULL: tskit.NULL}
352+
for list_id, m in enumerate(mutations):
353+
m_id_map[list_id] = tables.mutations.append(
354+
m.replace(site=site.id, parent=m_id_map[m.parent], time=n_tm[m.node])
355+
)
356+
tables.sort()
357+
return tables.tree_sequence()
358+
320359
@staticmethod
321360
def new(path, samples_chunk_size=None, variants_chunk_size=None):
322361

0 commit comments

Comments
 (0)