Skip to content

Commit c78492c

Browse files
committed
Limit linesweep to 1M ancestors
1 parent 7db1d38 commit c78492c

File tree

1 file changed

+25
-3
lines changed

1 file changed

+25
-3
lines changed

tsinfer/inference.py

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1644,6 +1644,19 @@ def group_by_linesweep(self):
16441644
epoch_end = np.hstack([breaks + 1, [self.num_ancestors]])
16451645
time_slices = np.vstack([epoch_start, epoch_end]).T
16461646
epoch_sizes = time_slices[:, 1] - time_slices[:, 0]
1647+
# Find the epoch where the sum of ancestors has reached 1M as a cutoff
1648+
if np.sum(epoch_sizes) <= 1e6:
1649+
over_1M_epoch = len(time_slices)
1650+
over_1M_epoch_first_ancestor = self.num_ancestors
1651+
else:
1652+
over_1M_epoch = np.where(np.cumsum(epoch_sizes) > 1e6)[0][0]
1653+
over_1M_epoch_first_ancestor = time_slices[over_1M_epoch, 0]
1654+
logger.info(
1655+
f"1M ancestors reached at {over_1M_epoch} epoch and ancestor "
1656+
f"{over_1M_epoch_first_ancestor}"
1657+
)
1658+
1659+
# Find the first epoch with more than a 500 times the median epoch size
16471660
median_size = np.median(epoch_sizes)
16481661
cutoff = 500 * median_size
16491662
# Zero out the first half so that an initial large epoch doesn't
@@ -1653,13 +1666,17 @@ def group_by_linesweep(self):
16531666
# the median epoch size. For a large set of human genomes the median epoch
16541667
# size is around 10, so we'll stop grouping by linesweep at 5000.
16551668
if np.max(epoch_sizes) <= cutoff:
1656-
large_epoch = len(time_slices)
1657-
large_epoch_first_ancestor = self.num_ancestors
1669+
large_epoch = over_1M_epoch
1670+
large_epoch_first_ancestor = over_1M_epoch_first_ancestor
1671+
logger.info("No large epochs found, using count cutoff")
16581672
else:
16591673
large_epoch = np.where(epoch_sizes > cutoff)[0][0]
16601674
large_epoch_first_ancestor = time_slices[large_epoch, 0]
1675+
logger.info(
1676+
f"Large epoch found at {large_epoch} with {epoch_sizes[large_epoch]} "
1677+
f"ancestors and ancestor {large_epoch_first_ancestor}"
1678+
)
16611679
logger.info(f"{len(time_slices)} epochs with {median_size} median size.")
1662-
logger.info(f"First large (>{cutoff}) epoch is {large_epoch}")
16631680
logger.info(f"Grouping {large_epoch_first_ancestor} ancestors by linesweep")
16641681
ancestor_grouping = ancestors.group_ancestors_by_linesweep(
16651682
start[:large_epoch_first_ancestor],
@@ -1672,6 +1689,11 @@ def group_by_linesweep(self):
16721689
ancestor_grouping[next_epoch] = np.arange(*time_slices[epoch])
16731690
next_epoch += 1
16741691

1692+
# Assert that every ancestor appears once in ancestor grouping
1693+
assert (
1694+
len(set(np.hstack(list(ancestor_grouping.values())))) == self.num_ancestors
1695+
)
1696+
16751697
# Remove the "virtual root" ancestor
16761698
try:
16771699
assert 0 in ancestor_grouping[0]

0 commit comments

Comments
 (0)