@@ -1644,6 +1644,19 @@ def group_by_linesweep(self):
16441644 epoch_end = np .hstack ([breaks + 1 , [self .num_ancestors ]])
16451645 time_slices = np .vstack ([epoch_start , epoch_end ]).T
16461646 epoch_sizes = time_slices [:, 1 ] - time_slices [:, 0 ]
1647+ # Find the epoch where the sum of ancestors has reached 1M as a cutoff
1648+ if np .sum (epoch_sizes ) <= 1e6 :
1649+ over_1M_epoch = len (time_slices )
1650+ over_1M_epoch_first_ancestor = self .num_ancestors
1651+ else :
1652+ over_1M_epoch = np .where (np .cumsum (epoch_sizes ) > 1e6 )[0 ][0 ]
1653+ over_1M_epoch_first_ancestor = time_slices [over_1M_epoch , 0 ]
1654+ logger .info (
1655+ f"1M ancestors reached at { over_1M_epoch } epoch and ancestor "
1656+ f"{ over_1M_epoch_first_ancestor } "
1657+ )
1658+
1659+ # Find the first epoch with more than a 500 times the median epoch size
16471660 median_size = np .median (epoch_sizes )
16481661 cutoff = 500 * median_size
16491662 # Zero out the first half so that an initial large epoch doesn't
@@ -1653,13 +1666,17 @@ def group_by_linesweep(self):
16531666 # the median epoch size. For a large set of human genomes the median epoch
16541667 # size is around 10, so we'll stop grouping by linesweep at 5000.
16551668 if np .max (epoch_sizes ) <= cutoff :
1656- large_epoch = len (time_slices )
1657- large_epoch_first_ancestor = self .num_ancestors
1669+ large_epoch = over_1M_epoch
1670+ large_epoch_first_ancestor = over_1M_epoch_first_ancestor
1671+ logger .info ("No large epochs found, using count cutoff" )
16581672 else :
16591673 large_epoch = np .where (epoch_sizes > cutoff )[0 ][0 ]
16601674 large_epoch_first_ancestor = time_slices [large_epoch , 0 ]
1675+ logger .info (
1676+ f"Large epoch found at { large_epoch } with { epoch_sizes [large_epoch ]} "
1677+ f"ancestors and ancestor { large_epoch_first_ancestor } "
1678+ )
16611679 logger .info (f"{ len (time_slices )} epochs with { median_size } median size." )
1662- logger .info (f"First large (>{ cutoff } ) epoch is { large_epoch } " )
16631680 logger .info (f"Grouping { large_epoch_first_ancestor } ancestors by linesweep" )
16641681 ancestor_grouping = ancestors .group_ancestors_by_linesweep (
16651682 start [:large_epoch_first_ancestor ],
@@ -1672,6 +1689,11 @@ def group_by_linesweep(self):
16721689 ancestor_grouping [next_epoch ] = np .arange (* time_slices [epoch ])
16731690 next_epoch += 1
16741691
1692+ # Assert that every ancestor appears once in ancestor grouping
1693+ assert (
1694+ len (set (np .hstack (list (ancestor_grouping .values ())))) == self .num_ancestors
1695+ )
1696+
16751697 # Remove the "virtual root" ancestor
16761698 try :
16771699 assert 0 in ancestor_grouping [0 ]
0 commit comments