Skip to content

Commit 3158f29

Browse files
Merge pull request #429 from hyanwong/remap
Add map_deletions_to_ts as Dataset method
2 parents 85b9c12 + 393636f commit 3158f29

File tree

1 file changed

+39
-0
lines changed

1 file changed

+39
-0
lines changed

sc2ts/dataset.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import logging
1010
import pathlib
1111

12+
import tskit
1213
import tqdm
1314
import pandas as pd
1415
import zarr
@@ -316,6 +317,44 @@ def reorder(self, path, additional_fields=list(), show_progress=False):
316317
index = np.lexsort([self.metadata.fields[f] for f in sort_key[::-1]])
317318
self.copy(path, sample_id=sample_id[index], show_progress=show_progress)
318319

320+
def map_deletions_to_ts(self, ts, start, end):
321+
"""
322+
Map a region of the dataset onto the specified tree sequence,
323+
adding deletions to the tree sequence where necessary.
324+
The tree sequence samples must all be present in the dataset.
325+
A new tree sequence is returned whose original mutations in the
326+
region have been removed and replaced using parsimony
327+
"""
328+
sample_id = ts.metadata["sc2ts"]["samples_strain"]
329+
if sample_id[0].startswith("Wuhan"):
330+
# Note: a lot of fiddling around here is due to the potential for sample 0
331+
# to be the reference, which is not in the viridian dataset.
332+
sample_id = sample_id[1:]
333+
tables = ts.dump_tables()
334+
n_tm = ts.nodes_time
335+
tree = ts.first()
336+
del_sites = [ts.site(position=p).id for p in range(start, end)]
337+
tables.mutations.keep_rows(np.logical_not(np.isin(ts.mutations_site, del_sites)))
338+
for var in self.variants(sample_id, np.arange(start, end)):
339+
tree.seek(var.position)
340+
site = ts.site(position=var.position)
341+
# treat non-nucleotide alleles (other IUPAC codes) as missing , but keep "-"
342+
keep = np.isin(var.alleles, np.array(['A', 'C', 'G', 'T', '-']))
343+
g = var.genotypes.copy()
344+
g[keep[g] == False] = tskit.MISSING_DATA
345+
anc = site.ancestral_state
346+
# Pad the start with the anc state (non-viridian) Wuhan sample, so add that
347+
if len(g) < ts.num_samples:
348+
g = np.append([list(var.alleles).index(anc)], g)
349+
_, mutations = tree.map_mutations(g, list(var.alleles), ancestral_state=anc)
350+
m_id_map = {tskit.NULL: tskit.NULL}
351+
for list_id, m in enumerate(mutations):
352+
m_id_map[list_id] = tables.mutations.append(
353+
m.replace(site=site.id, parent=m_id_map[m.parent], time=n_tm[m.node])
354+
)
355+
tables.sort()
356+
return tables.tree_sequence()
357+
319358
@staticmethod
320359
def new(path, samples_chunk_size=None, variants_chunk_size=None):
321360

0 commit comments

Comments
 (0)