|
9 | 9 | import logging
|
10 | 10 | import pathlib
|
11 | 11 |
|
| 12 | +import tskit |
12 | 13 | import tqdm
|
13 | 14 | import pandas as pd
|
14 | 15 | import zarr
|
@@ -316,6 +317,44 @@ def reorder(self, path, additional_fields=list(), show_progress=False):
|
316 | 317 | index = np.lexsort([self.metadata.fields[f] for f in sort_key[::-1]])
|
317 | 318 | self.copy(path, sample_id=sample_id[index], show_progress=show_progress)
|
318 | 319 |
|
| 320 | + def map_deletions_to_ts(self, ts, start, end): |
| 321 | + """ |
| 322 | + Map a region of the dataset onto the specified tree sequence, |
| 323 | + adding deletions to the tree sequence where necessary. |
| 324 | + The tree sequence samples must all be present in the dataset. |
| 325 | + A new tree sequence is returned whose original mutations in the |
| 326 | + region have been removed and replaced using parsimony |
| 327 | + """ |
| 328 | + sample_id = ts.metadata["sc2ts"]["samples_strain"] |
| 329 | + if sample_id[0].startswith("Wuhan"): |
| 330 | + # Note: a lot of fiddling around here is due to the potential for sample 0 |
| 331 | + # to be the reference, which is not in the viridian dataset. |
| 332 | + sample_id = sample_id[1:] |
| 333 | + tables = ts.dump_tables() |
| 334 | + n_tm = ts.nodes_time |
| 335 | + tree = ts.first() |
| 336 | + del_sites = [ts.site(position=p).id for p in range(start, end)] |
| 337 | + tables.mutations.keep_rows(np.logical_not(np.isin(ts.mutations_site, del_sites))) |
| 338 | + for var in self.variants(sample_id, np.arange(start, end)): |
| 339 | + tree.seek(var.position) |
| 340 | + site = ts.site(position=var.position) |
| 341 | + # treat non-nucleotide alleles (other IUPAC codes) as missing , but keep "-" |
| 342 | + keep = np.isin(var.alleles, np.array(['A', 'C', 'G', 'T', '-'])) |
| 343 | + g = var.genotypes.copy() |
| 344 | + g[keep[g] == False] = tskit.MISSING_DATA |
| 345 | + anc = site.ancestral_state |
| 346 | + # Pad the start with the anc state (non-viridian) Wuhan sample, so add that |
| 347 | + if len(g) < ts.num_samples: |
| 348 | + g = np.append([list(var.alleles).index(anc)], g) |
| 349 | + _, mutations = tree.map_mutations(g, list(var.alleles), ancestral_state=anc) |
| 350 | + m_id_map = {tskit.NULL: tskit.NULL} |
| 351 | + for list_id, m in enumerate(mutations): |
| 352 | + m_id_map[list_id] = tables.mutations.append( |
| 353 | + m.replace(site=site.id, parent=m_id_map[m.parent], time=n_tm[m.node]) |
| 354 | + ) |
| 355 | + tables.sort() |
| 356 | + return tables.tree_sequence() |
| 357 | + |
319 | 358 | @staticmethod
|
320 | 359 | def new(path, samples_chunk_size=None, variants_chunk_size=None):
|
321 | 360 |
|
|
0 commit comments