From ce926cb4e3f4f7aa354ae09184f6a0245636dc5e Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 23 Dec 2022 14:36:21 -0800 Subject: [PATCH 01/55] Bump version to 0.13.0-alpha.0 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 2544018b3..e647f277a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tskit" -version = "0.12.0" +version = "0.13.0-alpha.0" authors = ["tskit developers "] build = "build.rs" edition = "2021" From 09d81c44d50c9e80e0e399cf1aacc9da805e23b5 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Wed, 11 Jan 2023 13:48:31 -0800 Subject: [PATCH 02/55] feat: Use Bookmark in haploid_wright_fisher example. (#441) --- examples/haploid_wright_fisher.rs | 35 ++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/examples/haploid_wright_fisher.rs b/examples/haploid_wright_fisher.rs index 1d10b7fa8..27053a20d 100644 --- a/examples/haploid_wright_fisher.rs +++ b/examples/haploid_wright_fisher.rs @@ -8,12 +8,30 @@ use proptest::prelude::*; use rand::distributions::Distribution; use rand::SeedableRng; +fn rotate_edges(bookmark: &tskit::types::Bookmark, tables: &mut tskit::TableCollection) { + let num_edges = tables.edges().num_rows().as_usize(); + let left = + unsafe { std::slice::from_raw_parts_mut((*tables.as_mut_ptr()).edges.left, num_edges) }; + let right = + unsafe { std::slice::from_raw_parts_mut((*tables.as_mut_ptr()).edges.right, num_edges) }; + let parent = + unsafe { std::slice::from_raw_parts_mut((*tables.as_mut_ptr()).edges.parent, num_edges) }; + let child = + unsafe { std::slice::from_raw_parts_mut((*tables.as_mut_ptr()).edges.child, num_edges) }; + let mid = bookmark.edges().as_usize(); + left.rotate_left(mid); + right.rotate_left(mid); + parent.rotate_left(mid); + child.rotate_left(mid); +} + // ANCHOR: haploid_wright_fisher fn simulate( seed: u64, popsize: usize, num_generations: i32, simplify_interval: i32, + update_bookmark: bool, ) -> Result { if popsize == 0 { return Err(anyhow::Error::msg("popsize must be > 0")); @@ -46,6 +64,7 @@ fn simulate( let parent_picker = rand::distributions::Uniform::new(0, popsize); let breakpoint_generator = rand::distributions::Uniform::new(0.0, 1.0); let mut rng = rand::rngs::StdRng::seed_from_u64(seed); + let mut bookmark = tskit::types::Bookmark::new(); for birth_time in (0..num_generations).rev() { for c in children.iter_mut() { @@ -64,7 +83,10 @@ fn simulate( } if birth_time % simplify_interval == 0 { - tables.full_sort(tskit::TableSortOptions::default())?; + tables.sort(&bookmark, tskit::TableSortOptions::default())?; + if update_bookmark { + rotate_edges(&bookmark, &mut tables); + } if let Some(idmap) = tables.simplify(children, tskit::SimplificationOptions::default(), true)? { @@ -73,6 +95,9 @@ fn simulate( *o = idmap[usize::try_from(*o)?]; } } + if update_bookmark { + bookmark.set_edges(tables.edges().num_rows()); + } } std::mem::swap(&mut parents, &mut children); } @@ -91,6 +116,8 @@ struct SimParams { num_generations: i32, simplify_interval: i32, treefile: Option, + #[clap(short, long, help = "Use bookmark to avoid sorting entire edge table.")] + bookmark: bool, } fn main() -> Result<()> { @@ -100,6 +127,7 @@ fn main() -> Result<()> { params.popsize, params.num_generations, params.simplify_interval, + params.bookmark, )?; if let Some(treefile) = ¶ms.treefile { @@ -114,8 +142,9 @@ proptest! { #[test] fn test_simulate_proptest(seed in any::(), num_generations in 50..100i32, - simplify_interval in 1..100i32) { - let ts = simulate(seed, 100, num_generations, simplify_interval).unwrap(); + simplify_interval in 1..100i32, + bookmark in proptest::bool::ANY) { + let ts = simulate(seed, 100, num_generations, simplify_interval, bookmark).unwrap(); // stress test the branch length fn b/c it is not a trivial // wrapper around the C API. From a506523e6743e0d5f6ac6fbe3bae84ecb2e58091 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 23 Dec 2022 15:25:24 -0800 Subject: [PATCH 03/55] feat: Implement efficient edge buffering * cargo feature edgebuffer --- Cargo.toml | 5 + .../haploid_wright_fisher_edge_buffering.rs | 144 +++++ src/edgebuffer.rs | 499 ++++++++++++++++++ src/lib.rs | 7 + tests/test_edge_buffer.rs | 80 +++ 5 files changed, 735 insertions(+) create mode 100644 examples/haploid_wright_fisher_edge_buffering.rs create mode 100644 src/edgebuffer.rs create mode 100644 tests/test_edge_buffer.rs diff --git a/Cargo.toml b/Cargo.toml index e647f277a..66be29a09 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -50,6 +50,7 @@ pkg-config = "0.3" [features] provenance = ["humantime"] derive = ["tskit-derive", "serde", "serde_json", "bincode"] +edgebuffer = [] [package.metadata.docs.rs] all-features = true @@ -58,3 +59,7 @@ rustdoc-args = ["--cfg", "doc_cfg"] # Not run during tests [[example]] name = "tree_traversals" + +[[example]] +name = "haploid_wright_fisher_edge_buffering" +required-features = ["edgebuffer"] diff --git a/examples/haploid_wright_fisher_edge_buffering.rs b/examples/haploid_wright_fisher_edge_buffering.rs new file mode 100644 index 000000000..8eb2ffb0a --- /dev/null +++ b/examples/haploid_wright_fisher_edge_buffering.rs @@ -0,0 +1,144 @@ +// This is a rust implementation of the example +// found in tskit-c + +use anyhow::Result; +use clap::Parser; +#[cfg(test)] +use proptest::prelude::*; +use rand::distributions::Distribution; +use rand::SeedableRng; + +// ANCHOR: haploid_wright_fisher_edge_buffering +fn simulate( + seed: u64, + popsize: usize, + num_generations: i32, + simplify_interval: i32, +) -> Result { + if popsize == 0 { + return Err(anyhow::Error::msg("popsize must be > 0")); + } + if num_generations == 0 { + return Err(anyhow::Error::msg("num_generations must be > 0")); + } + if simplify_interval == 0 { + return Err(anyhow::Error::msg("simplify_interval must be > 0")); + } + let mut tables = tskit::TableCollection::new(1.0)?; + + // create parental nodes + let mut parents_and_children = { + let mut temp = vec![]; + let parental_time = f64::from(num_generations); + for _ in 0..popsize { + let node = tables.add_node(0, parental_time, -1, -1)?; + temp.push(node); + } + temp + }; + + // allocate space for offspring nodes + parents_and_children.resize(2 * parents_and_children.len(), tskit::NodeId::NULL); + + // Construct non-overlapping mutable slices into our vector. + let (mut parents, mut children) = parents_and_children.split_at_mut(popsize); + + let parent_picker = rand::distributions::Uniform::new(0, popsize); + let breakpoint_generator = rand::distributions::Uniform::new(0.0, 1.0); + let mut rng = rand::rngs::StdRng::seed_from_u64(seed); + let mut buffer = tskit::EdgeBuffer::default(); + + for birth_time in (0..num_generations).rev() { + for c in children.iter_mut() { + let bt = f64::from(birth_time); + let child = tables.add_node(0, bt, -1, -1)?; + let left_parent = parents + .get(parent_picker.sample(&mut rng)) + .ok_or_else(|| anyhow::Error::msg("invalid left_parent index"))?; + let right_parent = parents + .get(parent_picker.sample(&mut rng)) + .ok_or_else(|| anyhow::Error::msg("invalid right_parent index"))?; + buffer.setup_births(&[*left_parent, *right_parent], &[child])?; + let breakpoint = breakpoint_generator.sample(&mut rng); + buffer.record_birth(*left_parent, child, 0., breakpoint)?; + buffer.record_birth(*right_parent, child, breakpoint, 1.0)?; + buffer.finalize_births(); + *c = child; + } + + if birth_time % simplify_interval == 0 { + buffer.pre_simplification(&mut tables)?; + //tables.full_sort(tskit::TableSortOptions::default())?; + if let Some(idmap) = + tables.simplify(children, tskit::SimplificationOptions::default(), true)? + { + // remap child nodes + for o in children.iter_mut() { + *o = idmap[usize::try_from(*o)?]; + } + } + buffer.post_simplification(children, &mut tables)?; + } + std::mem::swap(&mut parents, &mut children); + } + + tables.build_index()?; + let treeseq = tables.tree_sequence(tskit::TreeSequenceFlags::default())?; + + Ok(treeseq) +} +// ANCHOR_END: haploid_wright_fisher_edge_buffering + +#[derive(Clone, clap::Parser)] +struct SimParams { + seed: u64, + popsize: usize, + num_generations: i32, + simplify_interval: i32, + treefile: Option, +} + +fn main() -> Result<()> { + let params = SimParams::parse(); + let treeseq = simulate( + params.seed, + params.popsize, + params.num_generations, + params.simplify_interval, + )?; + + if let Some(treefile) = ¶ms.treefile { + treeseq.dump(treefile, 0)?; + } + + Ok(()) +} + +#[cfg(test)] +proptest! { +#[test] + fn test_simulate_proptest(seed in any::(), + num_generations in 50..100i32, + simplify_interval in 1..100i32) { + let ts = simulate(seed, 100, num_generations, simplify_interval).unwrap(); + + // stress test the branch length fn b/c it is not a trivial + // wrapper around the C API. + { + use streaming_iterator::StreamingIterator; + let mut x = f64::NAN; + if let Ok(mut tree_iter) = ts.tree_iterator(0) { + // We will only do the first tree to save time. + if let Some(tree) = tree_iter.next() { + let b = tree.total_branch_length(false).unwrap(); + let b2 = unsafe { + tskit::bindings::tsk_tree_get_total_branch_length(tree.as_ptr(), -1, &mut x) + }; + assert!(b2 >= 0, "{}", b2); + assert!(f64::from(b) - x <= 1e-8); + } + } + } + } +} + diff --git a/src/edgebuffer.rs b/src/edgebuffer.rs new file mode 100644 index 000000000..8cbce6e06 --- /dev/null +++ b/src/edgebuffer.rs @@ -0,0 +1,499 @@ +use crate::NodeId; +use crate::Position; +use crate::TableCollection; +use crate::TskitError; + +// Design considerations: +// +// We should be able to do better than +// the fwdpp implementation by taking a +// time-sorted list of alive nodes and inserting +// their edges. +// After insertion, we can truncate the input +// edge table, eliminating all edges corresponding +// to the set of alive nodes. +// This procedure would only be done AFTER +// simplification, such that the copied +// edges are guaranteed correct. +// We'd need to hash the existence of these alive nodes. +// Then, when going over the edge buffer, we can ask +// if an edge parent is in the hashed set. +// We would also keep track of the smallest +// edge id, and that (maybe minus 1?) is our truncation point. + +fn swap_with_empty(vec: &mut Vec) { + let mut t = vec![]; + std::mem::swap(&mut t, vec); +} + +#[derive(Copy, Clone)] +struct AliveNodeTimes { + min: f64, + max: f64, +} + +impl AliveNodeTimes { + fn new(min: f64, max: f64) -> Self { + Self { min, max } + } + + fn non_overlapping(&self) -> bool { + self.min == self.max + } +} + +#[derive(Debug)] +struct PreExistingEdge { + first: usize, + last: usize, +} + +impl PreExistingEdge { + fn new(first: usize, last: usize) -> Self { + assert!(last > first); + Self { first, last } + } +} + +#[derive(Debug)] +struct Segment { + left: Position, + right: Position, +} + +type ChildSegments = std::collections::HashMap>; + +#[derive(Default, Debug)] +struct BufferedBirths { + children: Vec, + segments: std::collections::HashMap, +} + +impl BufferedBirths { + fn initialize(&mut self, parents: &[NodeId], children: &[NodeId]) -> Result<(), TskitError> { + self.children = children.to_vec(); + self.children.sort(); + self.segments.clear(); + // FIXME: don't do this work if the parent already exists + for p in parents { + let mut segments = ChildSegments::default(); + for c in &self.children { + if segments.insert(*c, vec![]).is_some() { + return Err(TskitError::LibraryError("redundant child ids".to_owned())); + } + } + self.segments.insert(*p, segments); + } + Ok(()) + } +} + +#[derive(Default, Debug)] +pub struct EdgeBuffer { + left: Vec, + right: Vec, + child: Vec, + // TODO: this should be + // an option so that we can use take. + buffered_births: BufferedBirths, + // NOTE: these vectors are wasteful: + // 1. usize is more than we need, + // but it is more convenient. + // 2. Worse, these vectors will + // contain N elements, where + // N is the total number of nodes, + // but likely many fewer nodes than that + // have actually had offspring. + // It is hard to fix this -- we cannot + // guarantee that parents are entered + // in any specific order. + head: Vec, + tail: Vec, + next: Vec, +} + +impl EdgeBuffer { + fn insert_new_parent(&mut self, parent: usize, child: NodeId, left: Position, right: Position) { + self.left.push(left); + self.right.push(right); + self.child.push(child); + self.head[parent] = self.left.len() - 1; + self.tail[parent] = self.head[parent]; + self.next.push(usize::MAX); + } + + fn extend_parent(&mut self, parent: usize, child: NodeId, left: Position, right: Position) { + self.left.push(left); + self.right.push(right); + self.child.push(child); + let t = self.tail[parent]; + self.tail[parent] = self.left.len() - 1; + self.next[t] = self.left.len() - 1; + self.next.push(usize::MAX); + } + + fn clear(&mut self) { + self.left.clear(); + self.right.clear(); + self.child.clear(); + self.head.clear(); + self.tail.clear(); + self.next.clear(); + } + + fn release_memory(&mut self) { + swap_with_empty(&mut self.head); + swap_with_empty(&mut self.next); + swap_with_empty(&mut self.left); + swap_with_empty(&mut self.right); + swap_with_empty(&mut self.child); + swap_with_empty(&mut self.tail); + } + + fn extract_buffered_births(&mut self) -> BufferedBirths { + let mut b = BufferedBirths::default(); + std::mem::swap(&mut self.buffered_births, &mut b); + b + } + + // Should Err if prents/children not unique + pub fn setup_births( + &mut self, + parents: &[NodeId], + children: &[NodeId], + ) -> Result<(), TskitError> { + self.buffered_births.initialize(parents, children) + } + + pub fn finalize_births(&mut self) { + let buffered_births = self.extract_buffered_births(); + for (p, children) in buffered_births.segments.iter() { + for c in buffered_births.children.iter() { + if let Some(segs) = children.get(c) { + for s in segs { + self.buffer_birth(*p, *c, s.left, s.right).unwrap(); + } + } else { + // should be error + panic!(); + } + } + } + } + + pub fn record_birth( + &mut self, + parent: P, + child: C, + left: L, + right: R, + ) -> Result<(), TskitError> + where + P: Into, + C: Into, + L: Into, + R: Into, + { + let parent = parent.into(); + + let child = child.into(); + if let Some(parent_buffer) = self.buffered_births.segments.get_mut(&parent) { + if let Some(v) = parent_buffer.get_mut(&child) { + let left = left.into(); + let right = right.into(); + v.push(Segment { left, right }); + } else { + // should be an error + panic!(); + } + } else { + // should be an error + panic!(); + } + + Ok(()) + } + + // NOTE: tskit is overly strict during simplification, + // enforcing sorting requirements on the edge table + // that are not strictly necessary. + fn buffer_birth( + &mut self, + parent: P, + child: C, + left: L, + right: R, + ) -> Result<(), TskitError> + where + P: Into, + C: Into, + L: Into, + R: Into, + { + let parent = parent.into(); + if parent < 0 { + return Err(TskitError::IndexError); + } + + let parent = parent.as_usize(); + + if parent >= self.head.len() { + self.head.resize(parent + 1, usize::MAX); + self.tail.resize(parent + 1, usize::MAX); + } + + if self.head[parent] == usize::MAX { + self.insert_new_parent(parent, child.into(), left.into(), right.into()); + } else { + self.extend_parent(parent, child.into(), left.into(), right.into()); + } + Ok(()) + } + + // NOTE: we can probably have this function not error: + // the head array is populated by i32 converted to usize, + // so if things are getting out of range, we should be + // in trouble before this point. + // NOTE: we need a bitflags here for other options, like sorting the head + // contents based on birth time. + pub fn pre_simplification(&mut self, tables: &mut TableCollection) -> Result<(), TskitError> { + let num_input_edges = tables.edges().num_rows().as_usize(); + let mut head_index: Vec = self + .head + .iter() + .enumerate() + .filter(|(_, j)| **j != usize::MAX) + .map(|(i, _)| i) + .collect(); + + let node_time = tables.nodes().time_slice(); + head_index.sort_by(|a, b| node_time[*a].partial_cmp(&node_time[*b]).unwrap()); + //for (i, h) in self.head.iter().rev().enumerate() { + for h in head_index.into_iter() { + let parent = match i32::try_from(h) { + Ok(value) => value, + Err(_) => { + return Err(TskitError::RangeError( + "usize to i32 conversion failed".to_owned(), + )) + } + }; + tables.add_edge( + self.left[self.head[h]], + self.right[self.head[h]], + parent, + self.child[self.head[h]], + )?; + + let mut next = self.next[self.head[h]]; + while next != usize::MAX { + tables.add_edge(self.left[next], self.right[next], parent, self.child[next])?; + next = self.next[next]; + } + } + + self.release_memory(); + + // This assert is redundant b/c TableCollection + // works via MBox/NonNull. + assert!(!tables.as_ptr().is_null()); + // SAFETY: table collection pointer is not null and num_edges + // is the right length. + let num_edges = tables.edges().num_rows().as_usize(); + let edge_left = + unsafe { std::slice::from_raw_parts_mut((*tables.as_mut_ptr()).edges.left, num_edges) }; + let edge_right = unsafe { + std::slice::from_raw_parts_mut((*tables.as_mut_ptr()).edges.right, num_edges) + }; + let edge_parent = unsafe { + std::slice::from_raw_parts_mut((*tables.as_mut_ptr()).edges.parent, num_edges) + }; + let edge_child = unsafe { + std::slice::from_raw_parts_mut((*tables.as_mut_ptr()).edges.child, num_edges) + }; + edge_left.rotate_left(num_input_edges); + edge_right.rotate_left(num_input_edges); + edge_parent.rotate_left(num_input_edges); + edge_child.rotate_left(num_input_edges); + Ok(()) + } + + fn alive_node_times(&self, alive: &[NodeId], tables: &mut TableCollection) -> AliveNodeTimes { + let node_times = tables.nodes().time_slice_raw(); + let mut max_alive_node_time = 0.0; + let mut min_alive_node_time = f64::MAX; + + for a in alive { + let time = node_times[a.as_usize()]; + max_alive_node_time = if time > max_alive_node_time { + time + } else { + max_alive_node_time + }; + min_alive_node_time = if time < min_alive_node_time { + time + } else { + min_alive_node_time + }; + } + AliveNodeTimes::new(min_alive_node_time, max_alive_node_time) + } + + // The method here ends up creating a problem: + // we are buffering nodes with increasing node id + // that are also more ancient. This is the opposite + // order from what happens during a forward-time simulation. + fn buffer_existing_edges( + &mut self, + pre_existing_edges: Vec, + tables: &mut TableCollection, + ) -> Result { + let parent = tables.edges().parent_slice(); + let child = tables.edges().child_slice(); + let left = tables.edges().left_slice(); + let right = tables.edges().right_slice(); + let mut rv = 0; + for pre in pre_existing_edges.iter() { + self.setup_births(&[parent[pre.first]], &child[pre.first..pre.last])?; + for e in pre.first..pre.last { + assert_eq!(parent[e], parent[pre.first]); + self.record_birth(parent[e], child[e], left[e], right[e])?; + rv += 1; + } + self.finalize_births(); + } + + Ok(rv) + } + + fn collect_pre_existing_edges( + &self, + alive_node_times: AliveNodeTimes, + tables: &mut TableCollection, + ) -> Vec { + let mut edges = vec![]; + let mut i = 0; + let parent = tables.edges().parent_slice(); + let child = tables.edges().child_slice(); + let node_time = tables.nodes().time_slice(); + while i < parent.len() { + let p = parent[i]; + let c = child[i]; + if node_time[p.as_usize()] <= alive_node_times.max + || (node_time[c.as_usize()] < alive_node_times.max + && node_time[p.as_usize()] > alive_node_times.max) + { + let mut j = 0_usize; + while i + j < parent.len() && parent[i + j] == p { + j += 1; + } + edges.push(PreExistingEdge::new(i, i + j)); + i += j; + } else { + break; + } + } + edges + } + + // FIXME: + // + // 1. If min/max parent alive times are equal, return. + // DONE + // 2. Else, we need to do a rotation at min_edge + // before truncation. + // DONE + // 3. However, we also have to respect our API + // and process each parent carefully, + // setting up the birth/death epochs. + // We need to use setup_births and finalize_births + // to get this right. + // DONE + // 4. We are doing this in the wrong temporal order. + // We need to pre-process all existing edge intervals, + // cache them, then go backwards through them, + // so that we buffer them present-to-past + pub fn post_simplification( + &mut self, + alive: &[NodeId], + tables: &mut TableCollection, + ) -> Result<(), TskitError> { + self.clear(); + + let alive_node_times = self.alive_node_times(alive, tables); + if alive_node_times.non_overlapping() { + // There can be no overlap between current + // edges and births that are about to happen, + // so we get out. + return Ok(()); + } + + let pre_existing_edges = self.collect_pre_existing_edges(alive_node_times, tables); + let min_edge = self.buffer_existing_edges(pre_existing_edges, tables)?; + let num_edges = tables.edges().num_rows().as_usize(); + let edge_left = + unsafe { std::slice::from_raw_parts_mut((*tables.as_mut_ptr()).edges.left, num_edges) }; + let edge_right = unsafe { + std::slice::from_raw_parts_mut((*tables.as_mut_ptr()).edges.right, num_edges) + }; + let edge_parent = unsafe { + std::slice::from_raw_parts_mut((*tables.as_mut_ptr()).edges.parent, num_edges) + }; + let edge_child = unsafe { + std::slice::from_raw_parts_mut((*tables.as_mut_ptr()).edges.child, num_edges) + }; + edge_left.rotate_left(min_edge); + edge_right.rotate_left(min_edge); + edge_parent.rotate_left(min_edge); + edge_child.rotate_left(min_edge); + // SAFETY: ????? + let rv = unsafe { + crate::bindings::tsk_edge_table_truncate( + &mut (*tables.as_mut_ptr()).edges, + (num_edges - min_edge) as crate::bindings::tsk_size_t, + ) + }; + handle_tsk_return_value!(rv, ()) + } +} + +#[test] +fn test_pre_simplification() { + let mut tables = TableCollection::new(10.).unwrap(); + let mut buffer = EdgeBuffer::default(); + let p0 = tables.add_node(0, 1.0, -1, -1).unwrap(); + let p1 = tables.add_node(0, 1.0, -1, -1).unwrap(); + let c0 = tables.add_node(0, 0.0, -1, -1).unwrap(); + let c1 = tables.add_node(0, 0.0, -1, -1).unwrap(); + buffer.setup_births(&[p0, p1], &[c0, c1]).unwrap(); + + // Record data in a way that intentionally + // breaks what tskit wants: + // * children are not sorted in increading order + // of id. + buffer.record_birth(0, 3, 5.0, 10.0).unwrap(); + buffer.record_birth(0, 2, 0.0, 5.0).unwrap(); + buffer.record_birth(1, 3, 0.0, 5.0).unwrap(); + buffer.record_birth(1, 2, 5.0, 10.0).unwrap(); + buffer.finalize_births(); + buffer.pre_simplification(&mut tables).unwrap(); + assert_eq!(tables.edges().num_rows(), 4); + tables.simplify(&[2, 3], 0, false).unwrap(); + assert_eq!(tables.edges().num_rows(), 0); +} + +#[test] +fn test_post_simplification() { + let mut tables = TableCollection::new(10.).unwrap(); + let p0 = tables.add_node(0, 1.0, -1, -1).unwrap(); + let p1 = tables.add_node(0, 1.0, -1, -1).unwrap(); + let c0 = tables.add_node(0, 0.0, -1, -1).unwrap(); + let c1 = tables.add_node(0, 0.0, -1, -1).unwrap(); + let _e0 = tables.add_edge(0.0, 10.0, p0, c0).unwrap(); + let _e1 = tables.add_edge(0.0, 10.0, p1, c1).unwrap(); + assert_eq!(tables.edges().num_rows(), 2); + let alive = vec![c0, c1]; // the children have replaced the parents + let mut buffer = EdgeBuffer::default(); + buffer.post_simplification(&alive, &mut tables).unwrap(); + assert_eq!(tables.edges().num_rows(), 2); +} diff --git a/src/lib.rs b/src/lib.rs index 81e5b1228..31ae77573 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -140,6 +140,13 @@ pub use trees::{Tree, TreeSequence}; #[cfg_attr(doc_cfg, doc(cfg(feature = "provenance")))] pub mod provenance; +#[cfg(feature = "edgebuffer")] +mod edgebuffer; + +#[cfg(feature = "edgebuffer")] +#[cfg_attr(doc_cfg, doc(cfg(feature = "edgebuffer")))] +pub use edgebuffer::EdgeBuffer; + /// Handles return codes from low-level tskit functions. /// /// When an error from the tskit C API is detected, diff --git a/tests/test_edge_buffer.rs b/tests/test_edge_buffer.rs new file mode 100644 index 000000000..249dfd290 --- /dev/null +++ b/tests/test_edge_buffer.rs @@ -0,0 +1,80 @@ +#![cfg(feature = "edgebuffer")] + +use proptest::prelude::*; +use rand::distributions::Distribution; +use rand::SeedableRng; + +use tskit::EdgeBuffer; +use tskit::TableCollection; +use tskit::TreeSequence; + +fn overlapping_generations(seed: u64, pdeath: f64, simplify: i32) -> TreeSequence { + let mut tables = TableCollection::new(1.0).unwrap(); + let mut buffer = EdgeBuffer::default(); + let mut rng = rand::rngs::StdRng::seed_from_u64(seed); + + let popsize = 10; + + let mut parents = vec![]; + + for _ in 0..popsize { + let node = tables.add_node(0, 100.0, -1, -1).unwrap(); + parents.push(node); + } + + let death = rand::distributions::Uniform::new(0., 1.0); + let parent_picker = rand::distributions::Uniform::new(0, popsize); + + for birth_time in (0..10).rev() { + let mut replacements = vec![]; + for i in 0..parents.len() { + if death.sample(&mut rng) <= pdeath { + replacements.push(i); + } + } + let mut births = vec![]; + + for _ in 0..replacements.len() { + let parent_index = parent_picker.sample(&mut rng); + let parent = parents[parent_index]; + let child = tables.add_node(0, birth_time as f64, -1, -1).unwrap(); + births.push(child); + buffer.setup_births(&[parent], &[child]).unwrap(); + buffer.record_birth(parent, child, 0., 1.).unwrap(); + buffer.finalize_births(); + } + + for (r, b) in replacements.iter().zip(births.iter()) { + assert!(*r < parents.len()); + parents[*r] = *b; + } + if birth_time % simplify == 0 { + buffer.pre_simplification(&mut tables).unwrap(); + //tables.full_sort(tskit::TableSortOptions::default()).unwrap(); + if let Some(idmap) = tables + .simplify(&parents, tskit::SimplificationOptions::default(), true) + .unwrap() + { + // remap child nodes + for o in parents.iter_mut() { + *o = idmap[usize::try_from(*o).unwrap()]; + } + } + buffer.post_simplification(&parents, &mut tables).unwrap(); + } + } + + tables.build_index().unwrap(); + + tables.tree_sequence(0.into()).unwrap() +} + +#[cfg(test)] +proptest! { + #[test] + fn test_edge_buffer_overlapping_generations(seed in any::(), + pdeath in 0.05..1.0, + simplify_interval in 1..100i32) { + let _ = overlapping_generations(seed, pdeath, simplify_interval); + } +} From 28eab9b544e1e7f970c309736412758be0960fb9 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Wed, 11 Jan 2023 08:46:25 -0800 Subject: [PATCH 04/55] commenting out code == tests still pass --- src/edgebuffer.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/edgebuffer.rs b/src/edgebuffer.rs index 8cbce6e06..bf35ca878 100644 --- a/src/edgebuffer.rs +++ b/src/edgebuffer.rs @@ -366,6 +366,8 @@ impl EdgeBuffer { Ok(rv) } + // FIXME: clean up commented-out code + // if we decide we don't need it. fn collect_pre_existing_edges( &self, alive_node_times: AliveNodeTimes, @@ -374,14 +376,14 @@ impl EdgeBuffer { let mut edges = vec![]; let mut i = 0; let parent = tables.edges().parent_slice(); - let child = tables.edges().child_slice(); + //let child = tables.edges().child_slice(); let node_time = tables.nodes().time_slice(); while i < parent.len() { let p = parent[i]; - let c = child[i]; + // let c = child[i]; if node_time[p.as_usize()] <= alive_node_times.max - || (node_time[c.as_usize()] < alive_node_times.max - && node_time[p.as_usize()] > alive_node_times.max) + //|| (node_time[c.as_usize()] < alive_node_times.max + // && node_time[p.as_usize()] > alive_node_times.max) { let mut j = 0_usize; while i + j < parent.len() && parent[i + j] == p { From 4a1d1210963382cb48e5d51e79b2d3089ae7a585 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Thu, 12 Jan 2023 12:52:59 -0800 Subject: [PATCH 05/55] note re: perf --- src/edgebuffer.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/edgebuffer.rs b/src/edgebuffer.rs index bf35ca878..7abef80e6 100644 --- a/src/edgebuffer.rs +++ b/src/edgebuffer.rs @@ -107,6 +107,9 @@ pub struct EdgeBuffer { // It is hard to fix this -- we cannot // guarantee that parents are entered // in any specific order. + // 3. Performance IMPROVES MEASURABLY + // if we use u32 here. But tsk_size_t + // is u64. head: Vec, tail: Vec, next: Vec, From c28107723f5239abaf050f1a9b42e7f493166d77 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 13 Jan 2023 12:10:07 -0800 Subject: [PATCH 06/55] Add prototypes for stuff we will need. --- subprojects/tskit/tskit/tables.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/subprojects/tskit/tskit/tables.h b/subprojects/tskit/tskit/tables.h index bab354622..28e0754ab 100644 --- a/subprojects/tskit/tskit/tables.h +++ b/subprojects/tskit/tskit/tables.h @@ -670,6 +670,25 @@ typedef struct { bool store_pairs; } tsk_identity_segments_t; +// KRT's latest insanity + +struct tsk_streaming_simplifier_impl_t; + +typedef struct { + /* don't leak private types into public API */ + tsk_streaming_simplifier_impl_t * pimpl; +} tsk_streaming_simplifier_t; + +int tsk_streaming_simplifier_init(tsk_streaming_simplifier_impl_t * self, + tsk_table_collection_t *tables, const tsk_id_t *samples, + tsk_size_t num_samples, tsk_flags_t options, tsk_id_t *node_map); +int tsk_streaming_simplifier_free(tsk_streaming_simplifier_impl_t * self); +// metadata... +int tsk_streaming_simplifier_add_edge(tsk_streaming_simplifier_t * self, + double left, double right, tsk_id_t parent, tsk_id_t child); +int tsk_streaming_simplifier_merge_ancestors(tsk_streaming_simplifier_t * self); +int tsk_streaming_simplifier_finalise(tsk_streaming_simplifier_t * self); + /****************************************************************************/ /* Common function options */ /****************************************************************************/ From dbd018ff38ac6469be68d9179ae66c1f3b88cbf2 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 13 Jan 2023 12:47:52 -0800 Subject: [PATCH 07/55] init the simplifier --- subprojects/tskit/tskit/tables.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/subprojects/tskit/tskit/tables.c b/subprojects/tskit/tskit/tables.c index ee628d24f..82f79e83e 100644 --- a/subprojects/tskit/tskit/tables.c +++ b/subprojects/tskit/tskit/tables.c @@ -23,6 +23,7 @@ * SOFTWARE. */ +#include "tables.h" #include #include #include @@ -12870,3 +12871,27 @@ tsk_squash_edges(tsk_edge_t *edges, tsk_size_t num_edges, tsk_size_t *num_output out: return ret; } + +typedef struct { + simplifier_t simplifier, +} tsk_streaming_simplifier_impl_t; + +int tsk_streaming_simplifier_init(tsk_streaming_simplifier_impl_t * self, + tsk_table_collection_t *tables, const tsk_id_t *samples, + tsk_size_t num_samples, tsk_flags_t options, tsk_id_t *node_map) { + int ret = 0; + self->pimpl = tsk_malloc(sizeof(tsk_streaming_simplifier_impl_t)); + if (self->pimpl == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + ret = tsk_simplifier_init(&self->pimpl.simplifier, tables, samples, num_samples, options, node_map); + if (ret != 0) { + goto out; + } + +out: + return ret; +} + +// KRT's latest madness From 773f363aac0d75d27287f41cbec58a2eecbb9bd4 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 13 Jan 2023 12:50:18 -0800 Subject: [PATCH 08/55] move comment --- subprojects/tskit/tskit/tables.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/subprojects/tskit/tskit/tables.c b/subprojects/tskit/tskit/tables.c index 82f79e83e..01e41847d 100644 --- a/subprojects/tskit/tskit/tables.c +++ b/subprojects/tskit/tskit/tables.c @@ -12872,6 +12872,8 @@ tsk_squash_edges(tsk_edge_t *edges, tsk_size_t num_edges, tsk_size_t *num_output return ret; } +// KRT's latest madness + typedef struct { simplifier_t simplifier, } tsk_streaming_simplifier_impl_t; @@ -12893,5 +12895,3 @@ int tsk_streaming_simplifier_init(tsk_streaming_simplifier_impl_t * self, out: return ret; } - -// KRT's latest madness From 0f3e47200b54fd1b53dab5cc8cd1e3e208dc0ba7 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 13 Jan 2023 13:18:35 -0800 Subject: [PATCH 09/55] lots of C fixes re: compiling --- subprojects/tskit/tskit/tables.c | 48 +++++++++++++++++++++++++++++--- subprojects/tskit/tskit/tables.h | 13 ++++----- 2 files changed, 50 insertions(+), 11 deletions(-) diff --git a/subprojects/tskit/tskit/tables.c b/subprojects/tskit/tskit/tables.c index 01e41847d..c30279cbd 100644 --- a/subprojects/tskit/tskit/tables.c +++ b/subprojects/tskit/tskit/tables.c @@ -12875,19 +12875,19 @@ tsk_squash_edges(tsk_edge_t *edges, tsk_size_t num_edges, tsk_size_t *num_output // KRT's latest madness typedef struct { - simplifier_t simplifier, + simplifier_t simplifier; } tsk_streaming_simplifier_impl_t; -int tsk_streaming_simplifier_init(tsk_streaming_simplifier_impl_t * self, +int tsk_streaming_simplifier_init(tsk_streaming_simplifier_t * self, tsk_table_collection_t *tables, const tsk_id_t *samples, - tsk_size_t num_samples, tsk_flags_t options, tsk_id_t *node_map) { + tsk_size_t num_samples, tsk_flags_t options) { int ret = 0; self->pimpl = tsk_malloc(sizeof(tsk_streaming_simplifier_impl_t)); if (self->pimpl == NULL) { ret = TSK_ERR_NO_MEMORY; goto out; } - ret = tsk_simplifier_init(&self->pimpl.simplifier, tables, samples, num_samples, options, node_map); + ret = simplifier_init(&self->pimpl->simplifier, samples, num_samples, tables, options); if (ret != 0) { goto out; } @@ -12895,3 +12895,43 @@ int tsk_streaming_simplifier_init(tsk_streaming_simplifier_impl_t * self, out: return ret; } + +// FIXME: parent not used +int tsk_streaming_simplifier_add_edge(tsk_streaming_simplifier_t * self, + double left, double right, tsk_id_t parent, tsk_id_t child) { + int ret = simplifier_extract_ancestry(&self->pimpl->simplifier, left, right, child); + return ret; +} + +int tsk_streaming_simplifier_merge_ancestors(tsk_streaming_simplifier_t * self, tsk_id_t parent) { + int ret = simplifier_merge_ancestors(&self->pimpl->simplifier, parent); + return ret; +} + +int tsk_streaming_simplifier_finalise(tsk_streaming_simplifier_t * self, tsk_id_t * node_map) { + int ret = 0; + simplifier_t * simplifier = self->pimpl->simplifier; + ret = simplifier_output_sites(simplifier); + if (ret != 0) { + goto out; + } + ret = simplifier_finalise_references(simplifier); + if (ret != 0) { + goto out; + } + if (node_map != NULL) { + /* Finally, output the new IDs for the nodes, if required. */ + tsk_memcpy(node_map, simplifier->node_id_map, + simplifier->input_tables.nodes.num_rows * sizeof(tsk_id_t)); + } + if (simplifier->edge_sort_offset != TSK_NULL) { + tsk_bug_assert(simplifier->options & TSK_SIMPLIFY_KEEP_INPUT_ROOTS); + ret = simplifier_sort_edges(simplifier); + if (ret != 0) { + goto out; + } + } +out: + return ret; +} + diff --git a/subprojects/tskit/tskit/tables.h b/subprojects/tskit/tskit/tables.h index 28e0754ab..c4fdb9d48 100644 --- a/subprojects/tskit/tskit/tables.h +++ b/subprojects/tskit/tskit/tables.h @@ -671,23 +671,22 @@ typedef struct { } tsk_identity_segments_t; // KRT's latest insanity - struct tsk_streaming_simplifier_impl_t; typedef struct { /* don't leak private types into public API */ - tsk_streaming_simplifier_impl_t * pimpl; + struct tsk_streaming_simplifier_impl_t * pimpl; } tsk_streaming_simplifier_t; -int tsk_streaming_simplifier_init(tsk_streaming_simplifier_impl_t * self, +int tsk_streaming_simplifier_init(tsk_streaming_simplifier_t * self, tsk_table_collection_t *tables, const tsk_id_t *samples, - tsk_size_t num_samples, tsk_flags_t options, tsk_id_t *node_map); -int tsk_streaming_simplifier_free(tsk_streaming_simplifier_impl_t * self); + tsk_size_t num_samples, tsk_flags_t options); +int tsk_streaming_simplifier_free(tsk_streaming_simplifier_t * self); // metadata... int tsk_streaming_simplifier_add_edge(tsk_streaming_simplifier_t * self, double left, double right, tsk_id_t parent, tsk_id_t child); -int tsk_streaming_simplifier_merge_ancestors(tsk_streaming_simplifier_t * self); -int tsk_streaming_simplifier_finalise(tsk_streaming_simplifier_t * self); +int tsk_streaming_simplifier_merge_ancestors(tsk_streaming_simplifier_t * self, tsk_id_t parent); +int tsk_streaming_simplifier_finalise(tsk_streaming_simplifier_t * self, tsk_id_t *node_map); /****************************************************************************/ /* Common function options */ From 09466650e08c2f3f8d590c0a5534963a72d852a8 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 13 Jan 2023 13:34:23 -0800 Subject: [PATCH 10/55] now we build --- subprojects/tskit/tskit/tables.c | 4 ++-- subprojects/tskit/tskit/tables.h | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/subprojects/tskit/tskit/tables.c b/subprojects/tskit/tskit/tables.c index c30279cbd..1df019e8d 100644 --- a/subprojects/tskit/tskit/tables.c +++ b/subprojects/tskit/tskit/tables.c @@ -12874,7 +12874,7 @@ tsk_squash_edges(tsk_edge_t *edges, tsk_size_t num_edges, tsk_size_t *num_output // KRT's latest madness -typedef struct { +typedef struct __tsk_streaming_simplifier_impl_t { simplifier_t simplifier; } tsk_streaming_simplifier_impl_t; @@ -12910,7 +12910,7 @@ int tsk_streaming_simplifier_merge_ancestors(tsk_streaming_simplifier_t * self, int tsk_streaming_simplifier_finalise(tsk_streaming_simplifier_t * self, tsk_id_t * node_map) { int ret = 0; - simplifier_t * simplifier = self->pimpl->simplifier; + simplifier_t * simplifier = &self->pimpl->simplifier; ret = simplifier_output_sites(simplifier); if (ret != 0) { goto out; diff --git a/subprojects/tskit/tskit/tables.h b/subprojects/tskit/tskit/tables.h index c4fdb9d48..64384201f 100644 --- a/subprojects/tskit/tskit/tables.h +++ b/subprojects/tskit/tskit/tables.h @@ -671,11 +671,9 @@ typedef struct { } tsk_identity_segments_t; // KRT's latest insanity -struct tsk_streaming_simplifier_impl_t; - typedef struct { /* don't leak private types into public API */ - struct tsk_streaming_simplifier_impl_t * pimpl; + struct __tsk_streaming_simplifier_impl_t * pimpl; } tsk_streaming_simplifier_t; int tsk_streaming_simplifier_init(tsk_streaming_simplifier_t * self, From b4ef1b4c77277a887784d988656283925a1a5202 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 13 Jan 2023 13:40:31 -0800 Subject: [PATCH 11/55] free --- subprojects/tskit/tskit/tables.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/subprojects/tskit/tskit/tables.c b/subprojects/tskit/tskit/tables.c index 1df019e8d..c69a1b4fc 100644 --- a/subprojects/tskit/tskit/tables.c +++ b/subprojects/tskit/tskit/tables.c @@ -12935,3 +12935,14 @@ int tsk_streaming_simplifier_finalise(tsk_streaming_simplifier_t * self, tsk_id_ return ret; } +int tsk_streaming_simplifier_free(tsk_streaming_simplifier_t * self) { + int ret = 0; + ret = simplifier_free(&self->pimpl->simplifier); + if (ret != 0) { + goto out; + } + tsk_safe_free(self->pimpl); +out: + return ret; +} + From 09889cdb56aba03ed385e3d0d5e01a56860a9fa0 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 13 Jan 2023 14:28:12 -0800 Subject: [PATCH 12/55] infrastructure --- src/edgebuffer.rs | 77 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/src/edgebuffer.rs b/src/edgebuffer.rs index 7abef80e6..335c7d42e 100644 --- a/src/edgebuffer.rs +++ b/src/edgebuffer.rs @@ -462,6 +462,83 @@ impl EdgeBuffer { } } +struct StreamingSimplifier { + simplifier: crate::bindings::tsk_streaming_simplifier_t, +} + +impl StreamingSimplifier { + fn new>( + samples: &[NodeId], + options: O, + tables: &mut TableCollection, + ) -> Result { + let mut simplifier = + std::mem::MaybeUninit::::uninit(); + let num_samples = samples.len() as crate::bindings::tsk_size_t; + match unsafe { + crate::bindings::tsk_streaming_simplifier_init( + simplifier.as_mut_ptr(), + tables.as_mut_ptr(), + samples.as_ptr().cast::(), + num_samples, + options.into().bits(), + ) + } { + code if code < 0 => Err(TskitError::ErrorCode { code }), + _ => Ok(Self { + simplifier: unsafe { simplifier.assume_init() }, + }), + } + } + + fn add_edge( + &mut self, + left: Position, + right: Position, + parent: NodeId, // FIXME: shouldn't be here + child: NodeId, + ) -> Result<(), TskitError> { + let code = unsafe { + crate::bindings::tsk_streaming_simplifier_add_edge( + &mut self.simplifier, + left.into(), + right.into(), + parent.into(), + child.into(), + ) + }; + handle_tsk_return_value!(code, ()) + } + + fn merge_ancestors(&mut self, parent: NodeId) -> Result<(), TskitError> { + let code = unsafe { + crate::bindings::tsk_streaming_simplifier_merge_ancestors( + &mut self.simplifier, + parent.into(), + ) + }; + handle_tsk_return_value!(code, ()) + } + + // FIXME: need to be able to validate that node_map is correct length! + fn finalise(&mut self, node_map: Option<&mut [NodeId]>) -> Result<(), TskitError> { + let n = match node_map { + Some(x) => x.as_mut_ptr().cast::(), + None => std::ptr::null_mut(), + }; + let code = + unsafe { crate::bindings::tsk_streaming_simplifier_finalise(&mut self.simplifier, n) }; + handle_tsk_return_value!(code, ()) + } +} + +impl Drop for StreamingSimplifier { + fn drop(&mut self) { + let code = unsafe { crate::bindings::tsk_streaming_simplifier_free(&mut self.simplifier) }; + assert_eq!(code, 0); + } +} + #[test] fn test_pre_simplification() { let mut tables = TableCollection::new(10.).unwrap(); From 500448cc658ae99303ca16dd6e372efa282f2e91 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 13 Jan 2023 14:35:37 -0800 Subject: [PATCH 13/55] getting safe-ish wrapper in place. --- src/edgebuffer.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/edgebuffer.rs b/src/edgebuffer.rs index 335c7d42e..a6069e6b2 100644 --- a/src/edgebuffer.rs +++ b/src/edgebuffer.rs @@ -539,6 +539,14 @@ impl Drop for StreamingSimplifier { } } +pub fn simplfify_from_buffer( + samples: &[NodeId], + tables: &mut TableCollection, + buffer: &mut EdgeBuffer, + node_map: Option<&mut [NodeId]>, +) -> Result<(), TskitError> { +} + #[test] fn test_pre_simplification() { let mut tables = TableCollection::new(10.).unwrap(); From 0c48cb2a8de952d6be286f357bfd6c88523406c5 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 13 Jan 2023 15:27:52 -0800 Subject: [PATCH 14/55] some details --- src/edgebuffer.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/edgebuffer.rs b/src/edgebuffer.rs index a6069e6b2..fac7a3b12 100644 --- a/src/edgebuffer.rs +++ b/src/edgebuffer.rs @@ -539,12 +539,19 @@ impl Drop for StreamingSimplifier { } } -pub fn simplfify_from_buffer( +pub fn simplfify_from_buffer>( samples: &[NodeId], + options: O, tables: &mut TableCollection, buffer: &mut EdgeBuffer, node_map: Option<&mut [NodeId]>, ) -> Result<(), TskitError> { + let mut simplifier = StreamingSimplifier::new(samples, options, tables)?; + for (i, h) in buffer.head.iter().rev().enumerate() { + unimplemented!() + } + + Ok(()) } #[test] From d429502bca0e812a61f21f28acd1576ed4dbcd43 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 13 Jan 2023 15:34:19 -0800 Subject: [PATCH 15/55] what needs to happen --- src/edgebuffer.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/edgebuffer.rs b/src/edgebuffer.rs index fac7a3b12..fcfb1b851 100644 --- a/src/edgebuffer.rs +++ b/src/edgebuffer.rs @@ -539,6 +539,13 @@ impl Drop for StreamingSimplifier { } } +// TODO: +// 1. The edge buffer API is wrong here. +// We need to encapsulate the existing type, +// and make one whose public API does what we need. +// 2. If this works out, it measn we need to extract +// the core buffer ops out to a private type +// and make public newtypes using it. pub fn simplfify_from_buffer>( samples: &[NodeId], options: O, From db853359af45dbed6128c56509d120ba4c2169cf Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 16 Jan 2023 08:01:28 -0800 Subject: [PATCH 16/55] draft impl for simplifying using streaming simplifier --- src/edgebuffer.rs | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/src/edgebuffer.rs b/src/edgebuffer.rs index fcfb1b851..877c66ce9 100644 --- a/src/edgebuffer.rs +++ b/src/edgebuffer.rs @@ -554,10 +554,47 @@ pub fn simplfify_from_buffer>( node_map: Option<&mut [NodeId]>, ) -> Result<(), TskitError> { let mut simplifier = StreamingSimplifier::new(samples, options, tables)?; + // Simplify the most recent births for (i, h) in buffer.head.iter().rev().enumerate() { - unimplemented!() + let parent = buffer.head.len() - i - 1; + assert_ne!(parent, usize::MAX); + simplifier.add_edge( + buffer.left[parent], + buffer.right[parent], + (parent as i32).into(), + buffer.child[parent], + )?; + let mut next = buffer.next[buffer.head[*h]]; + let parent = NodeId::from(parent as i32); + while next != usize::MAX { + simplifier.add_edge( + buffer.left[next], + buffer.right[next], + parent, + buffer.child[next], + )?; + next = buffer.next[next]; + } + simplifier.merge_ancestors(parent)?; + } + buffer.release_memory(); + + // Simplify pre-existing edges. + let left = tables.edges().left_slice(); + let right = tables.edges().right_slice(); + let parent = tables.edges().parent_slice(); + let child = tables.edges().child_slice(); + let mut i = 0; + while i < left.len() { + let p = parent[i]; + while i < left.len() && parent[i] == p { + simplifier.add_edge(left[i], right[i], parent[i], child[i])?; + i += 1; + } + simplifier.merge_ancestors(p)?; } + simplifier.finalise(node_map)?; Ok(()) } From 1c14bd9d7bf15bf4669d63f0a2847b54021b7a3a Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 16 Jan 2023 08:05:08 -0800 Subject: [PATCH 17/55] make new fn pub --- src/lib.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 31ae77573..6ac5bfb85 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -147,6 +147,10 @@ mod edgebuffer; #[cfg_attr(doc_cfg, doc(cfg(feature = "edgebuffer")))] pub use edgebuffer::EdgeBuffer; +#[cfg(feature = "edgebuffer")] +#[cfg_attr(doc_cfg, doc(cfg(feature = "edgebuffer")))] +pub use edgebuffer::simplfify_from_buffer; + /// Handles return codes from low-level tskit functions. /// /// When an error from the tskit C API is detected, From d55895e9c4550c749a8709f516f02ce192b76673 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 16 Jan 2023 08:34:08 -0800 Subject: [PATCH 18/55] fix segfault. find safety issues that is a bit of a doozy. --- .../haploid_wright_fisher_edge_buffering.rs | 40 ++++++++----- src/edgebuffer.rs | 57 ++++++++++++------- 2 files changed, 62 insertions(+), 35 deletions(-) diff --git a/examples/haploid_wright_fisher_edge_buffering.rs b/examples/haploid_wright_fisher_edge_buffering.rs index 8eb2ffb0a..f6e7236d2 100644 --- a/examples/haploid_wright_fisher_edge_buffering.rs +++ b/examples/haploid_wright_fisher_edge_buffering.rs @@ -7,6 +7,7 @@ use clap::Parser; use proptest::prelude::*; use rand::distributions::Distribution; use rand::SeedableRng; +use tskit::NodeId; // ANCHOR: haploid_wright_fisher_edge_buffering fn simulate( @@ -47,8 +48,10 @@ fn simulate( let breakpoint_generator = rand::distributions::Uniform::new(0.0, 1.0); let mut rng = rand::rngs::StdRng::seed_from_u64(seed); let mut buffer = tskit::EdgeBuffer::default(); + let mut node_map: Vec = vec![]; for birth_time in (0..num_generations).rev() { + println!("birth_time = {birth_time:}"); for c in children.iter_mut() { let bt = f64::from(birth_time); let child = tables.add_node(0, bt, -1, -1)?; @@ -58,25 +61,37 @@ fn simulate( let right_parent = parents .get(parent_picker.sample(&mut rng)) .ok_or_else(|| anyhow::Error::msg("invalid right_parent index"))?; - buffer.setup_births(&[*left_parent, *right_parent], &[child])?; + //buffer.setup_births(&[*left_parent, *right_parent], &[child])?; let breakpoint = breakpoint_generator.sample(&mut rng); - buffer.record_birth(*left_parent, child, 0., breakpoint)?; - buffer.record_birth(*right_parent, child, breakpoint, 1.0)?; - buffer.finalize_births(); + buffer.buffer_birth(*left_parent, child, 0., breakpoint)?; + buffer.buffer_birth(*right_parent, child, breakpoint, 1.0)?; + //buffer.finalize_births(); *c = child; } if birth_time % simplify_interval == 0 { - buffer.pre_simplification(&mut tables)?; + //buffer.pre_simplification(&mut tables)?; //tables.full_sort(tskit::TableSortOptions::default())?; - if let Some(idmap) = - tables.simplify(children, tskit::SimplificationOptions::default(), true)? - { - // remap child nodes - for o in children.iter_mut() { - *o = idmap[usize::try_from(*o)?]; - } + node_map.resize(tables.nodes().num_rows().as_usize(), tskit::NodeId::NULL); + tskit::simplfify_from_buffer( + children, + tskit::SimplificationOptions::default(), + &mut tables, + &mut buffer, + Some(&mut node_map), + )?; + for o in children.iter_mut() { + assert!(o.as_usize() < node_map.len()); + *o = node_map[usize::try_from(*o)?]; } + //if let Some(idmap) = + // tables.simplify(children, tskit::SimplificationOptions::default(), true)? + //{ + // // remap child nodes + // for o in children.iter_mut() { + // *o = idmap[usize::try_from(*o)?]; + // } + //} buffer.post_simplification(children, &mut tables)?; } std::mem::swap(&mut parents, &mut children); @@ -141,4 +156,3 @@ proptest! { } } } - diff --git a/src/edgebuffer.rs b/src/edgebuffer.rs index 877c66ce9..99d2e1a17 100644 --- a/src/edgebuffer.rs +++ b/src/edgebuffer.rs @@ -220,7 +220,7 @@ impl EdgeBuffer { // NOTE: tskit is overly strict during simplification, // enforcing sorting requirements on the edge table // that are not strictly necessary. - fn buffer_birth( + pub fn buffer_birth( &mut self, parent: P, child: C, @@ -546,6 +546,10 @@ impl Drop for StreamingSimplifier { // 2. If this works out, it measn we need to extract // the core buffer ops out to a private type // and make public newtypes using it. +// FIXME: this fails because simplification setup does +// funny business with the pointers?. +// FIXME: this function is unsafe b/c of how tskit-c +// messes w/pointers behind the scenes. pub fn simplfify_from_buffer>( samples: &[NodeId], options: O, @@ -553,39 +557,48 @@ pub fn simplfify_from_buffer>( buffer: &mut EdgeBuffer, node_map: Option<&mut [NodeId]>, ) -> Result<(), TskitError> { + let a = tables.edges().num_rows(); + + // have to take copies of the current members of + // the edge table. + let left = tables.edges().left_slice().to_vec(); + let right = tables.edges().right_slice().to_vec(); + let parent = tables.edges().parent_slice().to_vec(); + let child = tables.edges().child_slice().to_vec(); let mut simplifier = StreamingSimplifier::new(samples, options, tables)?; + assert_eq!(a, tables.edges().num_rows()); // Simplify the most recent births for (i, h) in buffer.head.iter().rev().enumerate() { - let parent = buffer.head.len() - i - 1; - assert_ne!(parent, usize::MAX); - simplifier.add_edge( - buffer.left[parent], - buffer.right[parent], - (parent as i32).into(), - buffer.child[parent], - )?; - let mut next = buffer.next[buffer.head[*h]]; - let parent = NodeId::from(parent as i32); - while next != usize::MAX { + if *h != usize::MAX { + let parent = buffer.head.len() - i - 1; + assert_ne!(parent, usize::MAX); simplifier.add_edge( - buffer.left[next], - buffer.right[next], - parent, - buffer.child[next], + buffer.left[parent], + buffer.right[parent], + (parent as i32).into(), + buffer.child[parent], )?; - next = buffer.next[next]; + let mut next = buffer.next[*h]; + let parent = NodeId::from(parent as i32); + while next != usize::MAX { + println!("next={next:}, parent={parent:}"); + simplifier.add_edge( + buffer.left[next], + buffer.right[next], + parent, + buffer.child[next], + )?; + next = buffer.next[next]; + } + simplifier.merge_ancestors(parent)?; } - simplifier.merge_ancestors(parent)?; } buffer.release_memory(); // Simplify pre-existing edges. - let left = tables.edges().left_slice(); - let right = tables.edges().right_slice(); - let parent = tables.edges().parent_slice(); - let child = tables.edges().child_slice(); let mut i = 0; while i < left.len() { + println!("{i:} {}", left.len()); let p = parent[i]; while i < left.len() && parent[i] == p { simplifier.add_edge(left[i], right[i], parent[i], child[i])?; From 11bbd024b7aa53611d2aa56564e85c280226f451 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 16 Jan 2023 10:05:21 -0800 Subject: [PATCH 19/55] fix finalise --- subprojects/tskit/tskit/tables.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/subprojects/tskit/tskit/tables.c b/subprojects/tskit/tskit/tables.c index c69a1b4fc..3edee8190 100644 --- a/subprojects/tskit/tskit/tables.c +++ b/subprojects/tskit/tskit/tables.c @@ -12911,6 +12911,12 @@ int tsk_streaming_simplifier_merge_ancestors(tsk_streaming_simplifier_t * self, int tsk_streaming_simplifier_finalise(tsk_streaming_simplifier_t * self, tsk_id_t * node_map) { int ret = 0; simplifier_t * simplifier = &self->pimpl->simplifier; + if (simplifier->options & TSK_SIMPLIFY_KEEP_INPUT_ROOTS) { + ret = simplifier_insert_input_roots(simplifier); + if (ret != 0) { + goto out; + } + } ret = simplifier_output_sites(simplifier); if (ret != 0) { goto out; From 4ed3977ee4c9a3e544452f4e018c186d19c7eb8b Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 16 Jan 2023 10:30:09 -0800 Subject: [PATCH 20/55] confusion reigns right now --- .../haploid_wright_fisher_edge_buffering.rs | 4 + src/edgebuffer.rs | 89 ++++++++++++++++--- 2 files changed, 82 insertions(+), 11 deletions(-) diff --git a/examples/haploid_wright_fisher_edge_buffering.rs b/examples/haploid_wright_fisher_edge_buffering.rs index f6e7236d2..1c09d8760 100644 --- a/examples/haploid_wright_fisher_edge_buffering.rs +++ b/examples/haploid_wright_fisher_edge_buffering.rs @@ -52,6 +52,7 @@ fn simulate( for birth_time in (0..num_generations).rev() { println!("birth_time = {birth_time:}"); + tables.check_integrity(tskit::TableIntegrityCheckFlags::CHECK_EDGE_ORDERING)?; for c in children.iter_mut() { let bt = f64::from(birth_time); let child = tables.add_node(0, bt, -1, -1)?; @@ -72,6 +73,7 @@ fn simulate( if birth_time % simplify_interval == 0 { //buffer.pre_simplification(&mut tables)?; //tables.full_sort(tskit::TableSortOptions::default())?; + println!("{parents:?}"); node_map.resize(tables.nodes().num_rows().as_usize(), tskit::NodeId::NULL); tskit::simplfify_from_buffer( children, @@ -83,7 +85,9 @@ fn simulate( for o in children.iter_mut() { assert!(o.as_usize() < node_map.len()); *o = node_map[usize::try_from(*o)?]; + assert!(!o.is_null()); } + tables.check_integrity(tskit::TableIntegrityCheckFlags::CHECK_EDGE_ORDERING)?; //if let Some(idmap) = // tables.simplify(children, tskit::SimplificationOptions::default(), true)? //{ diff --git a/src/edgebuffer.rs b/src/edgebuffer.rs index 99d2e1a17..aaa3cf66c 100644 --- a/src/edgebuffer.rs +++ b/src/edgebuffer.rs @@ -543,13 +543,12 @@ impl Drop for StreamingSimplifier { // 1. The edge buffer API is wrong here. // We need to encapsulate the existing type, // and make one whose public API does what we need. -// 2. If this works out, it measn we need to extract +// 2. If this works out, it means we need to extract // the core buffer ops out to a private type // and make public newtypes using it. -// FIXME: this fails because simplification setup does -// funny business with the pointers?. // FIXME: this function is unsafe b/c of how tskit-c // messes w/pointers behind the scenes. +// Solution is to take ownership of the tables? pub fn simplfify_from_buffer>( samples: &[NodeId], options: O, @@ -557,40 +556,78 @@ pub fn simplfify_from_buffer>( buffer: &mut EdgeBuffer, node_map: Option<&mut [NodeId]>, ) -> Result<(), TskitError> { - let a = tables.edges().num_rows(); - - // have to take copies of the current members of + // have to take copies of the current members of // the edge table. let left = tables.edges().left_slice().to_vec(); let right = tables.edges().right_slice().to_vec(); let parent = tables.edges().parent_slice().to_vec(); let child = tables.edges().child_slice().to_vec(); + let node_times = tables.nodes().time_slice().to_vec(); let mut simplifier = StreamingSimplifier::new(samples, options, tables)?; - assert_eq!(a, tables.edges().num_rows()); + let mut last_parent_time = -1.0; // Simplify the most recent births for (i, h) in buffer.head.iter().rev().enumerate() { if *h != usize::MAX { let parent = buffer.head.len() - i - 1; + assert!(node_times[parent] >= last_parent_time); + last_parent_time = node_times[parent].into(); + println!("foo: {},{} | {} {}", i, parent, *h, buffer.head.len()); assert_ne!(parent, usize::MAX); + let mut edge_check: Vec<(NodeId, Position)> = vec![]; simplifier.add_edge( - buffer.left[parent], - buffer.right[parent], + buffer.left[*h], + buffer.right[*h], (parent as i32).into(), - buffer.child[parent], + buffer.child[*h], )?; + edge_check.push((buffer.child[*h], buffer.left[*h])); let mut next = buffer.next[*h]; + assert_ne!(next, *h); let parent = NodeId::from(parent as i32); while next != usize::MAX { println!("next={next:}, parent={parent:}"); + assert!(!edge_check + .iter() + .any(|x| *x == (buffer.child[next], buffer.left[next]))); simplifier.add_edge( buffer.left[next], buffer.right[next], parent, buffer.child[next], )?; + edge_check.push((buffer.child[next], buffer.left[next])); next = buffer.next[next]; } + println!("{edge_check:?}"); simplifier.merge_ancestors(parent)?; + + // major stress-test -- delete later + { + let l = tables.edges().left_slice(); + let p = tables.edges().parent_slice(); + let c = tables.edges().child_slice(); + let mut i = 0; + while i < l.len() { + let pi = p[i]; + while i < l.len() && p[i] == pi { + if i > 0 && c[i] == c[i - 1] { + assert_ne!( + l[i], + l[i - 1], + "{:?},{:?} | {:?},{:?} | {:?},{:?} => {:?}", + p[i], + p[i - 1], + c[i], + c[i - 1], + l[i], + l[i - 1], + edge_check + ); + } + i += 1; + } + } + } } } buffer.release_memory(); @@ -598,13 +635,43 @@ pub fn simplfify_from_buffer>( // Simplify pre-existing edges. let mut i = 0; while i < left.len() { - println!("{i:} {}", left.len()); let p = parent[i]; + let mut edge_check: Vec<(NodeId, Position)> = vec![]; while i < left.len() && parent[i] == p { + assert!(!edge_check.iter().any(|x| *x == (child[i], left[i]))); simplifier.add_edge(left[i], right[i], parent[i], child[i])?; + edge_check.push((child[i], left[i])); i += 1; } + println!("edge_check2: {edge_check:?}"); simplifier.merge_ancestors(p)?; + // major stress-test -- delete later + { + let l = tables.edges().left_slice(); + let p = tables.edges().parent_slice(); + let c = tables.edges().child_slice(); + let mut i = 0; + while i < l.len() { + let pi = p[i]; + while i < l.len() && p[i] == pi { + if i > 0 && c[i] == c[i - 1] { + assert_ne!( + l[i], + l[i - 1], + "{:?},{:?} | {:?},{:?} | {:?},{:?} => {:?}", + p[i], + p[i - 1], + c[i], + c[i - 1], + l[i], + l[i - 1], + edge_check + ); + } + i += 1; + } + } + } } simplifier.finalise(node_map)?; From ceffa91a9b82387259125a23ad5042aaab36c644 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 16 Jan 2023 10:37:08 -0800 Subject: [PATCH 21/55] more print debug --- src/edgebuffer.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/edgebuffer.rs b/src/edgebuffer.rs index aaa3cf66c..8aa0ce60a 100644 --- a/src/edgebuffer.rs +++ b/src/edgebuffer.rs @@ -571,7 +571,14 @@ pub fn simplfify_from_buffer>( let parent = buffer.head.len() - i - 1; assert!(node_times[parent] >= last_parent_time); last_parent_time = node_times[parent].into(); - println!("foo: {},{} | {} {}", i, parent, *h, buffer.head.len()); + println!( + "foo: {},{} | {} {} | {}", + i, + parent, + *h, + buffer.head.len(), + last_parent_time + ); assert_ne!(parent, usize::MAX); let mut edge_check: Vec<(NodeId, Position)> = vec![]; simplifier.add_edge( From 4bc9a05d4cc4d58748b5685abe439025848c2cd2 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 16 Jan 2023 10:41:16 -0800 Subject: [PATCH 22/55] hmmm --- src/edgebuffer.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/edgebuffer.rs b/src/edgebuffer.rs index 8aa0ce60a..13c2b6ebb 100644 --- a/src/edgebuffer.rs +++ b/src/edgebuffer.rs @@ -591,6 +591,7 @@ pub fn simplfify_from_buffer>( let mut next = buffer.next[*h]; assert_ne!(next, *h); let parent = NodeId::from(parent as i32); + assert!(parent >= 0); while next != usize::MAX { println!("next={next:}, parent={parent:}"); assert!(!edge_check @@ -643,6 +644,8 @@ pub fn simplfify_from_buffer>( let mut i = 0; while i < left.len() { let p = parent[i]; + assert!(node_times[p.as_usize()] >= last_parent_time); + last_parent_time = node_times[p.as_usize()].into(); let mut edge_check: Vec<(NodeId, Position)> = vec![]; while i < left.len() && parent[i] == p { assert!(!edge_check.iter().any(|x| *x == (child[i], left[i]))); From 1df577a22c34c109de6c9ec861c2cb8a861fce7a Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 16 Jan 2023 10:54:41 -0800 Subject: [PATCH 23/55] C-side stress --- subprojects/tskit/tskit/tables.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/subprojects/tskit/tskit/tables.c b/subprojects/tskit/tskit/tables.c index 3edee8190..9ba28c401 100644 --- a/subprojects/tskit/tskit/tables.c +++ b/subprojects/tskit/tskit/tables.c @@ -12904,7 +12904,29 @@ int tsk_streaming_simplifier_add_edge(tsk_streaming_simplifier_t * self, } int tsk_streaming_simplifier_merge_ancestors(tsk_streaming_simplifier_t * self, tsk_id_t parent) { + tsk_id_t * p = self->pimpl->simplifier.tables->edges.parent; + tsk_id_t * c = self->pimpl->simplifier.tables->edges.child; + double * l = self->pimpl->simplifier.tables->edges.left; + tsk_size_t num_edges = self->pimpl->simplifier.tables->edges.num_rows, i=1; + while (i < num_edges) { + if (i>0 && p[i] == p[i-1]) { + if(c[i]==c[i-1] && l[i]==l[i-1]) { + fprintf(stdout, "bailing early\n"); + abort(); } + } + i+=1; + } int ret = simplifier_merge_ancestors(&self->pimpl->simplifier, parent); + num_edges = self->pimpl->simplifier.tables->edges.num_rows; + i=1; + while (i < num_edges) { + if (i>0 && p[i] == p[i-1]) { + if(c[i]==c[i-1] && l[i]==l[i-1]) { + fprintf(stdout, "bailing %d %d %lf\n", p[i], c[i], l[i]); + abort(); } + } + i+=1; + } return ret; } From 1665c8691624c8b4220f14cff73bfd2443a4056f Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 16 Jan 2023 11:00:36 -0800 Subject: [PATCH 24/55] not erroring any more --- subprojects/tskit/tskit/tables.c | 1 + 1 file changed, 1 insertion(+) diff --git a/subprojects/tskit/tskit/tables.c b/subprojects/tskit/tskit/tables.c index 9ba28c401..8490112ad 100644 --- a/subprojects/tskit/tskit/tables.c +++ b/subprojects/tskit/tskit/tables.c @@ -12927,6 +12927,7 @@ int tsk_streaming_simplifier_merge_ancestors(tsk_streaming_simplifier_t * self, } i+=1; } + self->pimpl->simplifier.segment_queue_size = 0; return ret; } From bb1eba47511a66b06aa592b2ac1c46aba9477ae7 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 16 Jan 2023 11:18:55 -0800 Subject: [PATCH 25/55] clean out debug code --- .../haploid_wright_fisher_edge_buffering.rs | 4 - src/edgebuffer.rs | 127 ++++++++---------- subprojects/tskit/tskit/tables.c | 22 --- 3 files changed, 55 insertions(+), 98 deletions(-) diff --git a/examples/haploid_wright_fisher_edge_buffering.rs b/examples/haploid_wright_fisher_edge_buffering.rs index 1c09d8760..b718ae271 100644 --- a/examples/haploid_wright_fisher_edge_buffering.rs +++ b/examples/haploid_wright_fisher_edge_buffering.rs @@ -51,8 +51,6 @@ fn simulate( let mut node_map: Vec = vec![]; for birth_time in (0..num_generations).rev() { - println!("birth_time = {birth_time:}"); - tables.check_integrity(tskit::TableIntegrityCheckFlags::CHECK_EDGE_ORDERING)?; for c in children.iter_mut() { let bt = f64::from(birth_time); let child = tables.add_node(0, bt, -1, -1)?; @@ -73,7 +71,6 @@ fn simulate( if birth_time % simplify_interval == 0 { //buffer.pre_simplification(&mut tables)?; //tables.full_sort(tskit::TableSortOptions::default())?; - println!("{parents:?}"); node_map.resize(tables.nodes().num_rows().as_usize(), tskit::NodeId::NULL); tskit::simplfify_from_buffer( children, @@ -87,7 +84,6 @@ fn simulate( *o = node_map[usize::try_from(*o)?]; assert!(!o.is_null()); } - tables.check_integrity(tskit::TableIntegrityCheckFlags::CHECK_EDGE_ORDERING)?; //if let Some(idmap) = // tables.simplify(children, tskit::SimplificationOptions::default(), true)? //{ diff --git a/src/edgebuffer.rs b/src/edgebuffer.rs index 13c2b6ebb..e00ea5ee8 100644 --- a/src/edgebuffer.rs +++ b/src/edgebuffer.rs @@ -571,71 +571,55 @@ pub fn simplfify_from_buffer>( let parent = buffer.head.len() - i - 1; assert!(node_times[parent] >= last_parent_time); last_parent_time = node_times[parent].into(); - println!( - "foo: {},{} | {} {} | {}", - i, - parent, - *h, - buffer.head.len(), - last_parent_time - ); assert_ne!(parent, usize::MAX); - let mut edge_check: Vec<(NodeId, Position)> = vec![]; simplifier.add_edge( buffer.left[*h], buffer.right[*h], (parent as i32).into(), buffer.child[*h], )?; - edge_check.push((buffer.child[*h], buffer.left[*h])); let mut next = buffer.next[*h]; assert_ne!(next, *h); let parent = NodeId::from(parent as i32); assert!(parent >= 0); while next != usize::MAX { - println!("next={next:}, parent={parent:}"); - assert!(!edge_check - .iter() - .any(|x| *x == (buffer.child[next], buffer.left[next]))); simplifier.add_edge( buffer.left[next], buffer.right[next], parent, buffer.child[next], )?; - edge_check.push((buffer.child[next], buffer.left[next])); next = buffer.next[next]; } - println!("{edge_check:?}"); simplifier.merge_ancestors(parent)?; // major stress-test -- delete later - { - let l = tables.edges().left_slice(); - let p = tables.edges().parent_slice(); - let c = tables.edges().child_slice(); - let mut i = 0; - while i < l.len() { - let pi = p[i]; - while i < l.len() && p[i] == pi { - if i > 0 && c[i] == c[i - 1] { - assert_ne!( - l[i], - l[i - 1], - "{:?},{:?} | {:?},{:?} | {:?},{:?} => {:?}", - p[i], - p[i - 1], - c[i], - c[i - 1], - l[i], - l[i - 1], - edge_check - ); - } - i += 1; - } - } - } + //{ + // let l = tables.edges().left_slice(); + // let p = tables.edges().parent_slice(); + // let c = tables.edges().child_slice(); + // let mut i = 0; + // while i < l.len() { + // let pi = p[i]; + // while i < l.len() && p[i] == pi { + // if i > 0 && c[i] == c[i - 1] { + // assert_ne!( + // l[i], + // l[i - 1], + // "{:?},{:?} | {:?},{:?} | {:?},{:?} => {:?}", + // p[i], + // p[i - 1], + // c[i], + // c[i - 1], + // l[i], + // l[i - 1], + // edge_check + // ); + // } + // i += 1; + // } + // } + //} } } buffer.release_memory(); @@ -646,42 +630,41 @@ pub fn simplfify_from_buffer>( let p = parent[i]; assert!(node_times[p.as_usize()] >= last_parent_time); last_parent_time = node_times[p.as_usize()].into(); - let mut edge_check: Vec<(NodeId, Position)> = vec![]; + //let mut edge_check: Vec<(NodeId, Position)> = vec![]; while i < left.len() && parent[i] == p { - assert!(!edge_check.iter().any(|x| *x == (child[i], left[i]))); + //assert!(!edge_check.iter().any(|x| *x == (child[i], left[i]))); simplifier.add_edge(left[i], right[i], parent[i], child[i])?; - edge_check.push((child[i], left[i])); + //edge_check.push((child[i], left[i])); i += 1; } - println!("edge_check2: {edge_check:?}"); simplifier.merge_ancestors(p)?; // major stress-test -- delete later - { - let l = tables.edges().left_slice(); - let p = tables.edges().parent_slice(); - let c = tables.edges().child_slice(); - let mut i = 0; - while i < l.len() { - let pi = p[i]; - while i < l.len() && p[i] == pi { - if i > 0 && c[i] == c[i - 1] { - assert_ne!( - l[i], - l[i - 1], - "{:?},{:?} | {:?},{:?} | {:?},{:?} => {:?}", - p[i], - p[i - 1], - c[i], - c[i - 1], - l[i], - l[i - 1], - edge_check - ); - } - i += 1; - } - } - } + //{ + // let l = tables.edges().left_slice(); + // let p = tables.edges().parent_slice(); + // let c = tables.edges().child_slice(); + // let mut i = 0; + // while i < l.len() { + // let pi = p[i]; + // while i < l.len() && p[i] == pi { + // if i > 0 && c[i] == c[i - 1] { + // assert_ne!( + // l[i], + // l[i - 1], + // "{:?},{:?} | {:?},{:?} | {:?},{:?} => {:?}", + // p[i], + // p[i - 1], + // c[i], + // c[i - 1], + // l[i], + // l[i - 1], + // edge_check + // ); + // } + // i += 1; + // } + // } + //} } simplifier.finalise(node_map)?; diff --git a/subprojects/tskit/tskit/tables.c b/subprojects/tskit/tskit/tables.c index 8490112ad..b3f1cfc97 100644 --- a/subprojects/tskit/tskit/tables.c +++ b/subprojects/tskit/tskit/tables.c @@ -12904,29 +12904,7 @@ int tsk_streaming_simplifier_add_edge(tsk_streaming_simplifier_t * self, } int tsk_streaming_simplifier_merge_ancestors(tsk_streaming_simplifier_t * self, tsk_id_t parent) { - tsk_id_t * p = self->pimpl->simplifier.tables->edges.parent; - tsk_id_t * c = self->pimpl->simplifier.tables->edges.child; - double * l = self->pimpl->simplifier.tables->edges.left; - tsk_size_t num_edges = self->pimpl->simplifier.tables->edges.num_rows, i=1; - while (i < num_edges) { - if (i>0 && p[i] == p[i-1]) { - if(c[i]==c[i-1] && l[i]==l[i-1]) { - fprintf(stdout, "bailing early\n"); - abort(); } - } - i+=1; - } int ret = simplifier_merge_ancestors(&self->pimpl->simplifier, parent); - num_edges = self->pimpl->simplifier.tables->edges.num_rows; - i=1; - while (i < num_edges) { - if (i>0 && p[i] == p[i-1]) { - if(c[i]==c[i-1] && l[i]==l[i-1]) { - fprintf(stdout, "bailing %d %d %lf\n", p[i], c[i], l[i]); - abort(); } - } - i+=1; - } self->pimpl->simplifier.segment_queue_size = 0; return ret; } From 69550762c8d287b0e9b5ad1ce7398601c0c532a5 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 16 Jan 2023 11:29:42 -0800 Subject: [PATCH 26/55] on our way to copy-free --- subprojects/tskit/tskit/tables.c | 18 +++++++++++++++++- subprojects/tskit/tskit/tables.h | 4 ++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/subprojects/tskit/tskit/tables.c b/subprojects/tskit/tskit/tables.c index b3f1cfc97..ea56e9a7c 100644 --- a/subprojects/tskit/tskit/tables.c +++ b/subprojects/tskit/tskit/tables.c @@ -12942,6 +12942,23 @@ int tsk_streaming_simplifier_finalise(tsk_streaming_simplifier_t * self, tsk_id_ return ret; } +const tsk_id_t * tsk_streaming_simplifier_get_input_parent(tsk_streaming_simplifier_t * self) +{ + return self->pimpl->simplifier.input_tables.edges.parent; +} +const tsk_id_t * tsk_streaming_simplifier_get_input_child(tsk_streaming_simplifier_t * self) +{ + return self->pimpl->simplifier.input_tables.edges.child; +} +const double * tsk_streaming_simplifier_get_input_left(tsk_streaming_simplifier_t * self) +{ + return self->pimpl->simplifier.input_tables.edges.left; +} + +tsk_size_t tsk_streaming_simplifier_get_num_input_edges(tsk_streaming_simplifier_t * self) { + return self->pimpl->simplifier.input_tables.edges.num_rows; +} + int tsk_streaming_simplifier_free(tsk_streaming_simplifier_t * self) { int ret = 0; ret = simplifier_free(&self->pimpl->simplifier); @@ -12952,4 +12969,3 @@ int tsk_streaming_simplifier_free(tsk_streaming_simplifier_t * self) { out: return ret; } - diff --git a/subprojects/tskit/tskit/tables.h b/subprojects/tskit/tskit/tables.h index 64384201f..5c8be298e 100644 --- a/subprojects/tskit/tskit/tables.h +++ b/subprojects/tskit/tskit/tables.h @@ -685,6 +685,10 @@ int tsk_streaming_simplifier_add_edge(tsk_streaming_simplifier_t * self, double left, double right, tsk_id_t parent, tsk_id_t child); int tsk_streaming_simplifier_merge_ancestors(tsk_streaming_simplifier_t * self, tsk_id_t parent); int tsk_streaming_simplifier_finalise(tsk_streaming_simplifier_t * self, tsk_id_t *node_map); +const tsk_id_t * tsk_streaming_simplifier_get_input_parent(tsk_streaming_simplifier_t * self); +const tsk_id_t * tsk_streaming_simplifier_get_input_child(tsk_streaming_simplifier_t * self); +const double * tsk_streaming_simplifier_get_input_left(tsk_streaming_simplifier_t * self); +tsk_size_t tsk_streaming_simplifier_get_num_input_edges(tsk_streaming_simplifier_t * self); /****************************************************************************/ /* Common function options */ From d8df019aeaf4262becb7c1fc9669e39855963256 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 16 Jan 2023 11:36:49 -0800 Subject: [PATCH 27/55] more back end --- src/edgebuffer.rs | 46 ++++++++++++++++++++++++++++++++ subprojects/tskit/tskit/tables.c | 12 ++++++--- subprojects/tskit/tskit/tables.h | 9 ++++--- 3 files changed, 59 insertions(+), 8 deletions(-) diff --git a/src/edgebuffer.rs b/src/edgebuffer.rs index e00ea5ee8..cb44095cf 100644 --- a/src/edgebuffer.rs +++ b/src/edgebuffer.rs @@ -530,6 +530,52 @@ impl StreamingSimplifier { unsafe { crate::bindings::tsk_streaming_simplifier_finalise(&mut self.simplifier, n) }; handle_tsk_return_value!(code, ()) } + + fn input_num_edges(&self) -> usize { + unsafe { + crate::bindings::tsk_streaming_simplifier_get_num_input_edges(&self.simplifier) as usize + } + } + + fn input_left(&self) -> &[Position] { + unsafe { + std::slice::from_raw_parts( + crate::bindings::tsk_streaming_simplifier_get_input_left(&self.simplifier) + .cast::(), + self.input_num_edges(), + ) + } + } + + fn input_right(&self) -> &[Position] { + unsafe { + std::slice::from_raw_parts( + crate::bindings::tsk_streaming_simplifier_get_input_right(&self.simplifier) + .cast::(), + self.input_num_edges(), + ) + } + } + + fn input_parent(&self) -> &[NodeId] { + unsafe { + std::slice::from_raw_parts( + crate::bindings::tsk_streaming_simplifier_get_input_parent(&self.simplifier) + .cast::(), + self.input_num_edges(), + ) + } + } + + fn input_child(&self) -> &[NodeId] { + unsafe { + std::slice::from_raw_parts( + crate::bindings::tsk_streaming_simplifier_get_input_child(&self.simplifier) + .cast::(), + self.input_num_edges(), + ) + } + } } impl Drop for StreamingSimplifier { diff --git a/subprojects/tskit/tskit/tables.c b/subprojects/tskit/tskit/tables.c index ea56e9a7c..4f64958b0 100644 --- a/subprojects/tskit/tskit/tables.c +++ b/subprojects/tskit/tskit/tables.c @@ -12942,20 +12942,24 @@ int tsk_streaming_simplifier_finalise(tsk_streaming_simplifier_t * self, tsk_id_ return ret; } -const tsk_id_t * tsk_streaming_simplifier_get_input_parent(tsk_streaming_simplifier_t * self) +const tsk_id_t * tsk_streaming_simplifier_get_input_parent(const tsk_streaming_simplifier_t * self) { return self->pimpl->simplifier.input_tables.edges.parent; } -const tsk_id_t * tsk_streaming_simplifier_get_input_child(tsk_streaming_simplifier_t * self) +const tsk_id_t * tsk_streaming_simplifier_get_input_child(const tsk_streaming_simplifier_t * self) { return self->pimpl->simplifier.input_tables.edges.child; } -const double * tsk_streaming_simplifier_get_input_left(tsk_streaming_simplifier_t * self) +const double * tsk_streaming_simplifier_get_input_left(const tsk_streaming_simplifier_t * self) { return self->pimpl->simplifier.input_tables.edges.left; } +const double * tsk_streaming_simplifier_get_input_right(const tsk_streaming_simplifier_t * self) +{ + return self->pimpl->simplifier.input_tables.edges.right; +} -tsk_size_t tsk_streaming_simplifier_get_num_input_edges(tsk_streaming_simplifier_t * self) { +tsk_size_t tsk_streaming_simplifier_get_num_input_edges(const tsk_streaming_simplifier_t * self) { return self->pimpl->simplifier.input_tables.edges.num_rows; } diff --git a/subprojects/tskit/tskit/tables.h b/subprojects/tskit/tskit/tables.h index 5c8be298e..49a926769 100644 --- a/subprojects/tskit/tskit/tables.h +++ b/subprojects/tskit/tskit/tables.h @@ -685,10 +685,11 @@ int tsk_streaming_simplifier_add_edge(tsk_streaming_simplifier_t * self, double left, double right, tsk_id_t parent, tsk_id_t child); int tsk_streaming_simplifier_merge_ancestors(tsk_streaming_simplifier_t * self, tsk_id_t parent); int tsk_streaming_simplifier_finalise(tsk_streaming_simplifier_t * self, tsk_id_t *node_map); -const tsk_id_t * tsk_streaming_simplifier_get_input_parent(tsk_streaming_simplifier_t * self); -const tsk_id_t * tsk_streaming_simplifier_get_input_child(tsk_streaming_simplifier_t * self); -const double * tsk_streaming_simplifier_get_input_left(tsk_streaming_simplifier_t * self); -tsk_size_t tsk_streaming_simplifier_get_num_input_edges(tsk_streaming_simplifier_t * self); +const tsk_id_t * tsk_streaming_simplifier_get_input_parent(const tsk_streaming_simplifier_t * self); +const tsk_id_t * tsk_streaming_simplifier_get_input_child(const tsk_streaming_simplifier_t * self); +const double * tsk_streaming_simplifier_get_input_left(const tsk_streaming_simplifier_t * self); +const double * tsk_streaming_simplifier_get_input_right(const tsk_streaming_simplifier_t * self); +tsk_size_t tsk_streaming_simplifier_get_num_input_edges(const tsk_streaming_simplifier_t * self); /****************************************************************************/ /* Common function options */ From 60520ac93d90f275c3e41a1f5e74267e86799b0b Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 16 Jan 2023 11:41:03 -0800 Subject: [PATCH 28/55] borrow checker hates this --- src/edgebuffer.rs | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/edgebuffer.rs b/src/edgebuffer.rs index cb44095cf..db7b30f49 100644 --- a/src/edgebuffer.rs +++ b/src/edgebuffer.rs @@ -604,19 +604,12 @@ pub fn simplfify_from_buffer>( ) -> Result<(), TskitError> { // have to take copies of the current members of // the edge table. - let left = tables.edges().left_slice().to_vec(); - let right = tables.edges().right_slice().to_vec(); - let parent = tables.edges().parent_slice().to_vec(); - let child = tables.edges().child_slice().to_vec(); - let node_times = tables.nodes().time_slice().to_vec(); let mut simplifier = StreamingSimplifier::new(samples, options, tables)?; let mut last_parent_time = -1.0; // Simplify the most recent births for (i, h) in buffer.head.iter().rev().enumerate() { if *h != usize::MAX { let parent = buffer.head.len() - i - 1; - assert!(node_times[parent] >= last_parent_time); - last_parent_time = node_times[parent].into(); assert_ne!(parent, usize::MAX); simplifier.add_edge( buffer.left[*h], @@ -672,10 +665,13 @@ pub fn simplfify_from_buffer>( // Simplify pre-existing edges. let mut i = 0; - while i < left.len() { + let num_input_edges = simplifier.input_num_edges(); + let left = simplifier.input_left(); + let right = simplifier.input_right(); + let parent = simplifier.input_parent(); + let child = simplifier.input_child(); + while i < num_input_edges { let p = parent[i]; - assert!(node_times[p.as_usize()] >= last_parent_time); - last_parent_time = node_times[p.as_usize()].into(); //let mut edge_check: Vec<(NodeId, Position)> = vec![]; while i < left.len() && parent[i] == p { //assert!(!edge_check.iter().any(|x| *x == (child[i], left[i]))); From 746514432a1dcde39b178eb7ffa4189a39c59919 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 16 Jan 2023 11:47:18 -0800 Subject: [PATCH 29/55] maybe on to something --- src/edgebuffer.rs | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/src/edgebuffer.rs b/src/edgebuffer.rs index db7b30f49..0c7e45c49 100644 --- a/src/edgebuffer.rs +++ b/src/edgebuffer.rs @@ -576,6 +576,19 @@ impl StreamingSimplifier { ) } } + + fn get_input_parent(&self, u: usize) -> NodeId { + self.input_parent()[u] + } + fn get_input_child(&self, u: usize) -> NodeId { + self.input_child()[u] + } + fn get_input_left(&self, u: usize) -> Position { + self.input_left()[u] + } + fn get_input_right(&self, u: usize) -> Position { + self.input_right()[u] + } } impl Drop for StreamingSimplifier { @@ -666,16 +679,17 @@ pub fn simplfify_from_buffer>( // Simplify pre-existing edges. let mut i = 0; let num_input_edges = simplifier.input_num_edges(); - let left = simplifier.input_left(); - let right = simplifier.input_right(); - let parent = simplifier.input_parent(); - let child = simplifier.input_child(); while i < num_input_edges { - let p = parent[i]; + let p = simplifier.get_input_parent(i); //let mut edge_check: Vec<(NodeId, Position)> = vec![]; - while i < left.len() && parent[i] == p { + while i < num_input_edges && simplifier.get_input_parent(i) == p { //assert!(!edge_check.iter().any(|x| *x == (child[i], left[i]))); - simplifier.add_edge(left[i], right[i], parent[i], child[i])?; + simplifier.add_edge( + simplifier.get_input_left(i), + simplifier.get_input_right(i), + simplifier.get_input_parent(i), + simplifier.get_input_child(i), + )?; //edge_check.push((child[i], left[i])); i += 1; } From 510b7bb6e08c2af6e4156598384e6ae8e4e75470 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 16 Jan 2023 14:13:32 -0800 Subject: [PATCH 30/55] tests fail for overlapping generations --- tests/test_edge_buffer.rs | 76 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/tests/test_edge_buffer.rs b/tests/test_edge_buffer.rs index 249dfd290..f97e6e4a3 100644 --- a/tests/test_edge_buffer.rs +++ b/tests/test_edge_buffer.rs @@ -69,6 +69,72 @@ fn overlapping_generations(seed: u64, pdeath: f64, simplify: i32) -> TreeSequenc tables.tree_sequence(0.into()).unwrap() } +fn overlapping_generations_streaming_simplification( + seed: u64, + pdeath: f64, + simplify: i32, +) -> TreeSequence { + let mut tables = TableCollection::new(1.0).unwrap(); + let mut buffer = EdgeBuffer::default(); + let mut rng = rand::rngs::StdRng::seed_from_u64(seed); + + let popsize = 10; + + let mut parents = vec![]; + + for _ in 0..popsize { + let node = tables.add_node(0, 100.0, -1, -1).unwrap(); + parents.push(node); + } + + let death = rand::distributions::Uniform::new(0., 1.0); + let parent_picker = rand::distributions::Uniform::new(0, popsize); + let mut node_map: Vec = vec![]; + + for birth_time in (0..10).rev() { + let mut replacements = vec![]; + for i in 0..parents.len() { + if death.sample(&mut rng) <= pdeath { + replacements.push(i); + } + } + let mut births = vec![]; + + for _ in 0..replacements.len() { + let parent_index = parent_picker.sample(&mut rng); + let parent = parents[parent_index]; + let child = tables.add_node(0, birth_time as f64, -1, -1).unwrap(); + births.push(child); + buffer.buffer_birth(parent, child, 0., 1.).unwrap(); + } + + for (r, b) in replacements.iter().zip(births.iter()) { + assert!(*r < parents.len()); + parents[*r] = *b; + } + if birth_time % simplify == 0 { + node_map.resize(tables.nodes().num_rows().as_usize(), tskit::NodeId::NULL); + tskit::simplfify_from_buffer( + &parents, + tskit::SimplificationOptions::default(), + &mut tables, + &mut buffer, + Some(&mut node_map), + ) + .unwrap(); + for o in parents.iter_mut() { + assert!(o.as_usize() < node_map.len()); + *o = node_map[usize::try_from(*o).unwrap()]; + assert!(!o.is_null()); + } + buffer.post_simplification(&parents, &mut tables).unwrap(); + } + } + + tables.build_index().unwrap(); + tables.tree_sequence(0.into()).unwrap() +} + #[cfg(test)] proptest! { #[test] @@ -78,3 +144,13 @@ proptest! { let _ = overlapping_generations(seed, pdeath, simplify_interval); } } + +#[cfg(test)] +proptest! { + #[test] + fn test_edge_buffer_overlapping_generations_streaming_simplification(seed in any::(), + pdeath in 0.05..1.0, + simplify_interval in 1..100i32) { + let _ = overlapping_generations_streaming_simplification(seed, pdeath, simplify_interval); + } +} From b90060e12a83f5c3c084057a8b09e55b18cf1cf1 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 16 Jan 2023 14:22:50 -0800 Subject: [PATCH 31/55] some tidy, some messy --- tests/test_edge_buffer.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/test_edge_buffer.rs b/tests/test_edge_buffer.rs index f97e6e4a3..e1e93de70 100644 --- a/tests/test_edge_buffer.rs +++ b/tests/test_edge_buffer.rs @@ -18,7 +18,7 @@ fn overlapping_generations(seed: u64, pdeath: f64, simplify: i32) -> TreeSequenc let mut parents = vec![]; for _ in 0..popsize { - let node = tables.add_node(0, 100.0, -1, -1).unwrap(); + let node = tables.add_node(0, 10.0, -1, -1).unwrap(); parents.push(node); } @@ -83,7 +83,7 @@ fn overlapping_generations_streaming_simplification( let mut parents = vec![]; for _ in 0..popsize { - let node = tables.add_node(0, 100.0, -1, -1).unwrap(); + let node = tables.add_node(0, 10.0, -1, -1).unwrap(); parents.push(node); } @@ -92,6 +92,7 @@ fn overlapping_generations_streaming_simplification( let mut node_map: Vec = vec![]; for birth_time in (0..10).rev() { + println!("birth time {birth_time:?}"); let mut replacements = vec![]; for i in 0..parents.len() { if death.sample(&mut rng) <= pdeath { @@ -102,6 +103,7 @@ fn overlapping_generations_streaming_simplification( for _ in 0..replacements.len() { let parent_index = parent_picker.sample(&mut rng); + assert!(parent_index < parents.len()); let parent = parents[parent_index]; let child = tables.add_node(0, birth_time as f64, -1, -1).unwrap(); births.push(child); @@ -113,6 +115,7 @@ fn overlapping_generations_streaming_simplification( parents[*r] = *b; } if birth_time % simplify == 0 { + println!("simplifying!"); node_map.resize(tables.nodes().num_rows().as_usize(), tskit::NodeId::NULL); tskit::simplfify_from_buffer( &parents, @@ -122,15 +125,16 @@ fn overlapping_generations_streaming_simplification( Some(&mut node_map), ) .unwrap(); + println!("{parents:?}"); for o in parents.iter_mut() { assert!(o.as_usize() < node_map.len()); *o = node_map[usize::try_from(*o).unwrap()]; assert!(!o.is_null()); } + println!("remapped {parents:?}"); buffer.post_simplification(&parents, &mut tables).unwrap(); } } - tables.build_index().unwrap(); tables.tree_sequence(0.into()).unwrap() } From a8bcf57f7d56e23ec1ba9309d7af5376d3ac5c78 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 16 Jan 2023 14:28:06 -0800 Subject: [PATCH 32/55] different failure point now --- src/edgebuffer.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/edgebuffer.rs b/src/edgebuffer.rs index 0c7e45c49..58e212b83 100644 --- a/src/edgebuffer.rs +++ b/src/edgebuffer.rs @@ -578,15 +578,19 @@ impl StreamingSimplifier { } fn get_input_parent(&self, u: usize) -> NodeId { + assert!(u < self.input_num_edges()); self.input_parent()[u] } fn get_input_child(&self, u: usize) -> NodeId { + assert!(u < self.input_num_edges()); self.input_child()[u] } fn get_input_left(&self, u: usize) -> Position { + assert!(u < self.input_num_edges()); self.input_left()[u] } fn get_input_right(&self, u: usize) -> Position { + assert!(u < self.input_num_edges()); self.input_right()[u] } } @@ -635,6 +639,7 @@ pub fn simplfify_from_buffer>( let parent = NodeId::from(parent as i32); assert!(parent >= 0); while next != usize::MAX { + assert!(next < buffer.left.len()); simplifier.add_edge( buffer.left[next], buffer.right[next], From 7669cd385d9e15439f8d074d2221ff9b5b0d8f87 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 16 Jan 2023 14:54:17 -0800 Subject: [PATCH 33/55] works for overlapping gens now --- src/edgebuffer.rs | 110 ++++++++++++++++++++------------------ tests/test_edge_buffer.rs | 5 +- 2 files changed, 59 insertions(+), 56 deletions(-) diff --git a/src/edgebuffer.rs b/src/edgebuffer.rs index 58e212b83..ad7064618 100644 --- a/src/edgebuffer.rs +++ b/src/edgebuffer.rs @@ -621,63 +621,69 @@ pub fn simplfify_from_buffer>( ) -> Result<(), TskitError> { // have to take copies of the current members of // the edge table. - let mut simplifier = StreamingSimplifier::new(samples, options, tables)?; let mut last_parent_time = -1.0; + let mut head_index: Vec = buffer + .head + .iter() + .enumerate() + .filter(|(_, j)| **j != usize::MAX) + .map(|(i, _)| i) + .collect(); + + let node_time = tables.nodes().time_slice(); + head_index.sort_by(|a, b| node_time[*a].partial_cmp(&node_time[*b]).unwrap()); + let mut simplifier = StreamingSimplifier::new(samples, options, tables)?; // Simplify the most recent births - for (i, h) in buffer.head.iter().rev().enumerate() { - if *h != usize::MAX { - let parent = buffer.head.len() - i - 1; - assert_ne!(parent, usize::MAX); + //for (i, h) in buffer.head.iter().rev().enumerate() { + for h in head_index.into_iter() { + let parent = i32::try_from(h).unwrap(); + simplifier.add_edge( + buffer.left[buffer.head[h]], + buffer.right[buffer.head[h]], + parent.into(), + buffer.child[buffer.head[h]], + )?; + let mut next = buffer.next[buffer.head[h]]; + assert!(parent >= 0); + while next != usize::MAX { + assert!(next < buffer.left.len()); simplifier.add_edge( - buffer.left[*h], - buffer.right[*h], - (parent as i32).into(), - buffer.child[*h], + buffer.left[next], + buffer.right[next], + parent.into(), + buffer.child[next], )?; - let mut next = buffer.next[*h]; - assert_ne!(next, *h); - let parent = NodeId::from(parent as i32); - assert!(parent >= 0); - while next != usize::MAX { - assert!(next < buffer.left.len()); - simplifier.add_edge( - buffer.left[next], - buffer.right[next], - parent, - buffer.child[next], - )?; - next = buffer.next[next]; - } - simplifier.merge_ancestors(parent)?; - - // major stress-test -- delete later - //{ - // let l = tables.edges().left_slice(); - // let p = tables.edges().parent_slice(); - // let c = tables.edges().child_slice(); - // let mut i = 0; - // while i < l.len() { - // let pi = p[i]; - // while i < l.len() && p[i] == pi { - // if i > 0 && c[i] == c[i - 1] { - // assert_ne!( - // l[i], - // l[i - 1], - // "{:?},{:?} | {:?},{:?} | {:?},{:?} => {:?}", - // p[i], - // p[i - 1], - // c[i], - // c[i - 1], - // l[i], - // l[i - 1], - // edge_check - // ); - // } - // i += 1; - // } - // } - //} + next = buffer.next[next]; } + simplifier.merge_ancestors(parent.into())?; + + // major stress-test -- delete later + //{ + // let l = tables.edges().left_slice(); + // let p = tables.edges().parent_slice(); + // let c = tables.edges().child_slice(); + // let mut i = 0; + // while i < l.len() { + // let pi = p[i]; + // while i < l.len() && p[i] == pi { + // if i > 0 && c[i] == c[i - 1] { + // assert_ne!( + // l[i], + // l[i - 1], + // "{:?},{:?} | {:?},{:?} | {:?},{:?} => {:?}", + // p[i], + // p[i - 1], + // c[i], + // c[i - 1], + // l[i], + // l[i - 1], + // edge_check + // ); + // } + // i += 1; + // } + // } + //} } buffer.release_memory(); diff --git a/tests/test_edge_buffer.rs b/tests/test_edge_buffer.rs index e1e93de70..904969042 100644 --- a/tests/test_edge_buffer.rs +++ b/tests/test_edge_buffer.rs @@ -92,7 +92,6 @@ fn overlapping_generations_streaming_simplification( let mut node_map: Vec = vec![]; for birth_time in (0..10).rev() { - println!("birth time {birth_time:?}"); let mut replacements = vec![]; for i in 0..parents.len() { if death.sample(&mut rng) <= pdeath { @@ -109,13 +108,13 @@ fn overlapping_generations_streaming_simplification( births.push(child); buffer.buffer_birth(parent, child, 0., 1.).unwrap(); } + assert_eq!(replacements.len(), births.len()); for (r, b) in replacements.iter().zip(births.iter()) { assert!(*r < parents.len()); parents[*r] = *b; } if birth_time % simplify == 0 { - println!("simplifying!"); node_map.resize(tables.nodes().num_rows().as_usize(), tskit::NodeId::NULL); tskit::simplfify_from_buffer( &parents, @@ -125,13 +124,11 @@ fn overlapping_generations_streaming_simplification( Some(&mut node_map), ) .unwrap(); - println!("{parents:?}"); for o in parents.iter_mut() { assert!(o.as_usize() < node_map.len()); *o = node_map[usize::try_from(*o).unwrap()]; assert!(!o.is_null()); } - println!("remapped {parents:?}"); buffer.post_simplification(&parents, &mut tables).unwrap(); } } From a13f0d62ea07f8785101b1ba9b847567d02f6e02 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 17 Jan 2023 11:21:33 -0800 Subject: [PATCH 34/55] start to abstract out so that we can directly compare results via tests --- tests/test_edge_buffer.rs | 121 ++++++++++++++++++++++++++++++-------- 1 file changed, 96 insertions(+), 25 deletions(-) diff --git a/tests/test_edge_buffer.rs b/tests/test_edge_buffer.rs index 904969042..e6156c745 100644 --- a/tests/test_edge_buffer.rs +++ b/tests/test_edge_buffer.rs @@ -5,20 +5,104 @@ use rand::distributions::Distribution; use rand::SeedableRng; use tskit::EdgeBuffer; +use tskit::EdgeId; +use tskit::NodeId; use tskit::TableCollection; use tskit::TreeSequence; +use tskit::TskitError; + +trait Recording { + fn add_node(&mut self, flags: u32, time: f64) -> Result; + fn add_edge( + &mut self, + left: f64, + right: f64, + parent: NodeId, + child: NodeId, + ) -> Result<(), TskitError>; + + fn simplify(&mut self, samples: &mut [NodeId]) -> Result<(), TskitError>; + fn start_recording(&mut self, parents: &[NodeId], child: &[NodeId]) {} + fn end_recording(&mut self) {} +} -fn overlapping_generations(seed: u64, pdeath: f64, simplify: i32) -> TreeSequence { - let mut tables = TableCollection::new(1.0).unwrap(); - let mut buffer = EdgeBuffer::default(); +struct StandardTableCollectionWithBuffer { + tables: TableCollection, + buffer: EdgeBuffer, +} + +impl StandardTableCollectionWithBuffer { + fn new() -> Self { + Self { + tables: TableCollection::new(1.0).unwrap(), + buffer: EdgeBuffer::default(), + } + } +} + +impl Recording for StandardTableCollectionWithBuffer { + fn add_node(&mut self, flags: u32, time: f64) -> Result { + self.tables.add_node(flags, time, -1, -1) + } + + fn add_edge( + &mut self, + left: f64, + right: f64, + parent: NodeId, + child: NodeId, + ) -> Result<(), TskitError> { + self.buffer.record_birth(parent, child, left, right) + } + + fn start_recording(&mut self, parents: &[NodeId], children: &[NodeId]) { + self.buffer.setup_births(parents, children).unwrap() + } + + fn end_recording(&mut self) { + self.buffer.finalize_births() + } + + fn simplify(&mut self, samples: &mut [NodeId]) -> Result<(), TskitError> { + self.buffer.pre_simplification(&mut self.tables).unwrap(); + match self.tables.simplify(samples, 0, true) { + Ok(Some(idmap)) => { + for s in samples.iter_mut() { + *s = idmap[s.as_usize()]; + } + self.buffer + .post_simplification(samples, &mut self.tables) + .unwrap(); + Ok(()) + } + Ok(None) => panic!(), + Err(e) => Err(e), + } + } +} + +impl From for TreeSequence { + fn from(value: StandardTableCollectionWithBuffer) -> Self { + let mut value = value; + value.tables.build_index().unwrap(); + value.tables.tree_sequence(0.into()).unwrap() + } +} + +fn overlapping_generations(seed: u64, pdeath: f64, simplify: i32, recorder: T) -> TreeSequence +where + T: Into + Recording, +{ let mut rng = rand::rngs::StdRng::seed_from_u64(seed); let popsize = 10; let mut parents = vec![]; + let mut recorder = recorder; + for _ in 0..popsize { - let node = tables.add_node(0, 10.0, -1, -1).unwrap(); + let node = recorder.add_node(0, 10.0).unwrap(); parents.push(node); } @@ -37,11 +121,11 @@ fn overlapping_generations(seed: u64, pdeath: f64, simplify: i32) -> TreeSequenc for _ in 0..replacements.len() { let parent_index = parent_picker.sample(&mut rng); let parent = parents[parent_index]; - let child = tables.add_node(0, birth_time as f64, -1, -1).unwrap(); + let child = recorder.add_node(0, birth_time as f64).unwrap(); births.push(child); - buffer.setup_births(&[parent], &[child]).unwrap(); - buffer.record_birth(parent, child, 0., 1.).unwrap(); - buffer.finalize_births(); + recorder.start_recording(&[parent], &[child]); + recorder.add_edge(0., 1., parent, child).unwrap(); + recorder.end_recording(); } for (r, b) in replacements.iter().zip(births.iter()) { @@ -49,24 +133,10 @@ fn overlapping_generations(seed: u64, pdeath: f64, simplify: i32) -> TreeSequenc parents[*r] = *b; } if birth_time % simplify == 0 { - buffer.pre_simplification(&mut tables).unwrap(); - //tables.full_sort(tskit::TableSortOptions::default()).unwrap(); - if let Some(idmap) = tables - .simplify(&parents, tskit::SimplificationOptions::default(), true) - .unwrap() - { - // remap child nodes - for o in parents.iter_mut() { - *o = idmap[usize::try_from(*o).unwrap()]; - } - } - buffer.post_simplification(&parents, &mut tables).unwrap(); + recorder.simplify(&mut parents).unwrap(); } } - - tables.build_index().unwrap(); - - tables.tree_sequence(0.into()).unwrap() + recorder.into() } fn overlapping_generations_streaming_simplification( @@ -142,7 +212,8 @@ proptest! { fn test_edge_buffer_overlapping_generations(seed in any::(), pdeath in 0.05..1.0, simplify_interval in 1..100i32) { - let _ = overlapping_generations(seed, pdeath, simplify_interval); + let with_buffer = StandardTableCollectionWithBuffer::new(); + let _ = overlapping_generations(seed, pdeath, simplify_interval, with_buffer); } } From 389adc6842b5904aee60fc6fc6a4cd94b91dc4ce Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 17 Jan 2023 11:34:05 -0800 Subject: [PATCH 35/55] standard case --- tests/test_edge_buffer.rs | 67 +++++++++++++++++++++++++++++++++++---- 1 file changed, 60 insertions(+), 7 deletions(-) diff --git a/tests/test_edge_buffer.rs b/tests/test_edge_buffer.rs index e6156c745..c11050310 100644 --- a/tests/test_edge_buffer.rs +++ b/tests/test_edge_buffer.rs @@ -1,5 +1,7 @@ #![cfg(feature = "edgebuffer")] +use std::ops::Deref; + use proptest::prelude::*; use rand::distributions::Distribution; use rand::SeedableRng; @@ -26,12 +28,12 @@ trait Recording { fn end_recording(&mut self) {} } -struct StandardTableCollectionWithBuffer { +struct TableCollectionWithBuffer { tables: TableCollection, buffer: EdgeBuffer, } -impl StandardTableCollectionWithBuffer { +impl TableCollectionWithBuffer { fn new() -> Self { Self { tables: TableCollection::new(1.0).unwrap(), @@ -40,7 +42,7 @@ impl StandardTableCollectionWithBuffer { } } -impl Recording for StandardTableCollectionWithBuffer { +impl Recording for TableCollectionWithBuffer { fn add_node(&mut self, flags: u32, time: f64) -> Result { self.tables.add_node(flags, time, -1, -1) } @@ -81,14 +83,61 @@ impl Recording for StandardTableCollectionWithBuffer { } } -impl From for TreeSequence { - fn from(value: StandardTableCollectionWithBuffer) -> Self { +impl From for TreeSequence { + fn from(value: TableCollectionWithBuffer) -> Self { let mut value = value; value.tables.build_index().unwrap(); value.tables.tree_sequence(0.into()).unwrap() } } +struct StandardTableCollection(TableCollection); + +impl StandardTableCollection { + fn new() -> Self { + Self(TableCollection::new(1.0).unwrap()) + } +} + +impl Recording for StandardTableCollection { + fn add_node(&mut self, flags: u32, time: f64) -> Result { + self.0.add_node(flags, time, -1, -1) + } + fn add_edge( + &mut self, + left: f64, + right: f64, + parent: NodeId, + child: NodeId, + ) -> Result<(), TskitError> { + match self.0.add_edge(left, right, parent, child) { + Ok(_) => Ok(()), + Err(e) => Err(e), + } + } + fn simplify(&mut self, samples: &mut [NodeId]) -> Result<(), TskitError> { + self.0.full_sort(0).unwrap(); + match self.0.simplify(samples, 0, true) { + Ok(Some(idmap)) => { + for s in samples { + *s = idmap[s.as_usize()]; + } + Ok(()) + } + Ok(None) => Ok(()), + Err(e) => Err(e), + } + } +} + +impl From for TreeSequence { + fn from(value: StandardTableCollection) -> Self { + let mut value = value; + value.0.build_index(); + value.0.tree_sequence(0.into()).unwrap() + } +} + fn overlapping_generations(seed: u64, pdeath: f64, simplify: i32, recorder: T) -> TreeSequence where T: Into + Recording, @@ -212,8 +261,12 @@ proptest! { fn test_edge_buffer_overlapping_generations(seed in any::(), pdeath in 0.05..1.0, simplify_interval in 1..100i32) { - let with_buffer = StandardTableCollectionWithBuffer::new(); - let _ = overlapping_generations(seed, pdeath, simplify_interval, with_buffer); + let standard = StandardTableCollection::new(); + let standard_treeseq = overlapping_generations(seed, pdeath, simplify_interval, standard); + let with_buffer = TableCollectionWithBuffer::new(); + let standard_with_buffer = overlapping_generations(seed, pdeath, simplify_interval, with_buffer); + + assert_eq!(standard_treeseq.num_trees(), standard_with_buffer.num_trees()); } } From 7f571e63495424d31943e20570b1a95a582dc718 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 17 Jan 2023 11:42:20 -0800 Subject: [PATCH 36/55] and now the new hotness --- tests/test_edge_buffer.rs | 70 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 68 insertions(+), 2 deletions(-) diff --git a/tests/test_edge_buffer.rs b/tests/test_edge_buffer.rs index c11050310..d4b4c425a 100644 --- a/tests/test_edge_buffer.rs +++ b/tests/test_edge_buffer.rs @@ -24,7 +24,7 @@ trait Recording { ) -> Result<(), TskitError>; fn simplify(&mut self, samples: &mut [NodeId]) -> Result<(), TskitError>; - fn start_recording(&mut self, parents: &[NodeId], child: &[NodeId]) {} + fn start_recording(&mut self, _parents: &[NodeId], _child: &[NodeId]) {} fn end_recording(&mut self) {} } @@ -99,6 +99,69 @@ impl StandardTableCollection { } } +struct TableCollectionWithBufferForStreaming { + tables: TableCollection, + buffer: EdgeBuffer, + node_map: Vec, +} + +impl TableCollectionWithBufferForStreaming { + fn new() -> Self { + Self { + tables: TableCollection::new(1.0).unwrap(), + buffer: EdgeBuffer::default(), + node_map: vec![], + } + } +} + +impl Recording for TableCollectionWithBufferForStreaming { + fn add_node(&mut self, flags: u32, time: f64) -> Result { + self.tables.add_node(flags, time, -1, -1) + } + fn add_edge( + &mut self, + left: f64, + right: f64, + parent: NodeId, + child: NodeId, + ) -> Result<(), TskitError> { + self.buffer.buffer_birth(parent, child, left, right) + } + + fn simplify(&mut self, samples: &mut [NodeId]) -> Result<(), TskitError> { + self.node_map.resize( + self.tables.nodes().num_rows().as_usize(), + tskit::NodeId::NULL, + ); + tskit::simplfify_from_buffer( + &samples, + tskit::SimplificationOptions::default(), + &mut self.tables, + &mut self.buffer, + Some(&mut self.node_map), + ) + .unwrap(); + for o in samples.iter_mut() { + assert!(o.as_usize() < self.node_map.len()); + *o = self.node_map[usize::try_from(*o).unwrap()]; + assert!(!o.is_null()); + } + self.buffer + .post_simplification(&samples, &mut self.tables) + .unwrap(); + Ok(()) + } +} + +impl From for TreeSequence { + fn from(value: TableCollectionWithBufferForStreaming) -> Self { + let mut value = value; + value.tables.build_index().unwrap(); + value.tables.tree_sequence(0.into()).unwrap() + } +} + impl Recording for StandardTableCollection { fn add_node(&mut self, flags: u32, time: f64) -> Result { self.0.add_node(flags, time, -1, -1) @@ -133,7 +196,7 @@ impl Recording for StandardTableCollection { impl From for TreeSequence { fn from(value: StandardTableCollection) -> Self { let mut value = value; - value.0.build_index(); + value.0.build_index().unwrap(); value.0.tree_sequence(0.into()).unwrap() } } @@ -265,8 +328,11 @@ proptest! { let standard_treeseq = overlapping_generations(seed, pdeath, simplify_interval, standard); let with_buffer = TableCollectionWithBuffer::new(); let standard_with_buffer = overlapping_generations(seed, pdeath, simplify_interval, with_buffer); + let with_buffer_streaming = TableCollectionWithBufferForStreaming::new(); + let standard_with_buffer_streaming = overlapping_generations(seed, pdeath, simplify_interval, with_buffer_streaming); assert_eq!(standard_treeseq.num_trees(), standard_with_buffer.num_trees()); + assert_eq!(standard_treeseq.num_trees(), standard_with_buffer_streaming.num_trees()); } } From 46f20d761a9afe5a5d8fa5883651bdf3c4cc4b18 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 17 Jan 2023 11:57:49 -0800 Subject: [PATCH 37/55] getting there --- tests/test_edge_buffer.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/test_edge_buffer.rs b/tests/test_edge_buffer.rs index d4b4c425a..320987fd2 100644 --- a/tests/test_edge_buffer.rs +++ b/tests/test_edge_buffer.rs @@ -324,6 +324,7 @@ proptest! { fn test_edge_buffer_overlapping_generations(seed in any::(), pdeath in 0.05..1.0, simplify_interval in 1..100i32) { + use streaming_iterator::StreamingIterator; let standard = StandardTableCollection::new(); let standard_treeseq = overlapping_generations(seed, pdeath, simplify_interval, standard); let with_buffer = TableCollectionWithBuffer::new(); @@ -333,6 +334,20 @@ proptest! { assert_eq!(standard_treeseq.num_trees(), standard_with_buffer.num_trees()); assert_eq!(standard_treeseq.num_trees(), standard_with_buffer_streaming.num_trees()); + + // cannot do KC distance b/c trees not fully coalesced. + let mut trees_standard = standard_treeseq.tree_iterator(0).unwrap(); + let mut trees_with_buffer = standard_with_buffer.tree_iterator(0).unwrap(); + let mut trees_with_buffer_streaming = standard_with_buffer_streaming.tree_iterator(0).unwrap(); + + while let Some(tree) = trees_standard.next() { + let tree_with_buffer = trees_with_buffer.next().unwrap(); + assert_eq!(tree.interval(), tree_with_buffer.interval()); + //assert_eq!(tree.total_branch_length(true).unwrap(), tree_with_buffer.total_branch_length(true).unwrap()); + let tree_with_buffer_streaming = trees_with_buffer_streaming.next().unwrap(); + assert_eq!(tree.interval(), tree_with_buffer_streaming.interval()); + //assert_eq!(tree.total_branch_length(true).unwrap(), tree_with_buffer_streaming.total_branch_length(true).unwrap()); + } } } From cd1ecbefdb4db725118cb4efdd0ff9d5589757ee Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 17 Jan 2023 11:59:26 -0800 Subject: [PATCH 38/55] delete reduntant test --- tests/test_edge_buffer.rs | 77 --------------------------------------- 1 file changed, 77 deletions(-) diff --git a/tests/test_edge_buffer.rs b/tests/test_edge_buffer.rs index 320987fd2..f2367090f 100644 --- a/tests/test_edge_buffer.rs +++ b/tests/test_edge_buffer.rs @@ -251,73 +251,6 @@ where recorder.into() } -fn overlapping_generations_streaming_simplification( - seed: u64, - pdeath: f64, - simplify: i32, -) -> TreeSequence { - let mut tables = TableCollection::new(1.0).unwrap(); - let mut buffer = EdgeBuffer::default(); - let mut rng = rand::rngs::StdRng::seed_from_u64(seed); - - let popsize = 10; - - let mut parents = vec![]; - - for _ in 0..popsize { - let node = tables.add_node(0, 10.0, -1, -1).unwrap(); - parents.push(node); - } - - let death = rand::distributions::Uniform::new(0., 1.0); - let parent_picker = rand::distributions::Uniform::new(0, popsize); - let mut node_map: Vec = vec![]; - - for birth_time in (0..10).rev() { - let mut replacements = vec![]; - for i in 0..parents.len() { - if death.sample(&mut rng) <= pdeath { - replacements.push(i); - } - } - let mut births = vec![]; - - for _ in 0..replacements.len() { - let parent_index = parent_picker.sample(&mut rng); - assert!(parent_index < parents.len()); - let parent = parents[parent_index]; - let child = tables.add_node(0, birth_time as f64, -1, -1).unwrap(); - births.push(child); - buffer.buffer_birth(parent, child, 0., 1.).unwrap(); - } - assert_eq!(replacements.len(), births.len()); - - for (r, b) in replacements.iter().zip(births.iter()) { - assert!(*r < parents.len()); - parents[*r] = *b; - } - if birth_time % simplify == 0 { - node_map.resize(tables.nodes().num_rows().as_usize(), tskit::NodeId::NULL); - tskit::simplfify_from_buffer( - &parents, - tskit::SimplificationOptions::default(), - &mut tables, - &mut buffer, - Some(&mut node_map), - ) - .unwrap(); - for o in parents.iter_mut() { - assert!(o.as_usize() < node_map.len()); - *o = node_map[usize::try_from(*o).unwrap()]; - assert!(!o.is_null()); - } - buffer.post_simplification(&parents, &mut tables).unwrap(); - } - } - tables.build_index().unwrap(); - tables.tree_sequence(0.into()).unwrap() -} - #[cfg(test)] proptest! { #[test] @@ -350,13 +283,3 @@ proptest! { } } } - -#[cfg(test)] -proptest! { - #[test] - fn test_edge_buffer_overlapping_generations_streaming_simplification(seed in any::(), - pdeath in 0.05..1.0, - simplify_interval in 1..100i32) { - let _ = overlapping_generations_streaming_simplification(seed, pdeath, simplify_interval); - } -} From 60d5b861a9989301f830c57a2a512941390a011d Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 17 Jan 2023 12:01:13 -0800 Subject: [PATCH 39/55] branch lengths messing up? --- tests/test_edge_buffer.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_edge_buffer.rs b/tests/test_edge_buffer.rs index f2367090f..17f2c3c27 100644 --- a/tests/test_edge_buffer.rs +++ b/tests/test_edge_buffer.rs @@ -91,6 +91,7 @@ impl From for TreeSequence { } } +#[repr(transparent)] struct StandardTableCollection(TableCollection); impl StandardTableCollection { @@ -276,7 +277,7 @@ proptest! { while let Some(tree) = trees_standard.next() { let tree_with_buffer = trees_with_buffer.next().unwrap(); assert_eq!(tree.interval(), tree_with_buffer.interval()); - //assert_eq!(tree.total_branch_length(true).unwrap(), tree_with_buffer.total_branch_length(true).unwrap()); + assert_eq!(tree.total_branch_length(true).unwrap(), tree_with_buffer.total_branch_length(true).unwrap()); let tree_with_buffer_streaming = trees_with_buffer_streaming.next().unwrap(); assert_eq!(tree.interval(), tree_with_buffer_streaming.interval()); //assert_eq!(tree.total_branch_length(true).unwrap(), tree_with_buffer_streaming.total_branch_length(true).unwrap()); From 27672fbee188d896137f828897a64a880f7a63b1 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 17 Jan 2023 12:07:08 -0800 Subject: [PATCH 40/55] nope, still broken --- tests/test_edge_buffer.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/test_edge_buffer.rs b/tests/test_edge_buffer.rs index 17f2c3c27..9538c3fdc 100644 --- a/tests/test_edge_buffer.rs +++ b/tests/test_edge_buffer.rs @@ -85,9 +85,10 @@ impl Recording for TableCollectionWithBuffer { impl From for TreeSequence { fn from(value: TableCollectionWithBuffer) -> Self { - let mut value = value; - value.tables.build_index().unwrap(); - value.tables.tree_sequence(0.into()).unwrap() + value + .tables + .tree_sequence(tskit::TreeSequenceFlags::BUILD_INDEXES) + .unwrap() } } @@ -157,9 +158,10 @@ impl Recording for TableCollectionWithBufferForStreaming { impl From for TreeSequence { fn from(value: TableCollectionWithBufferForStreaming) -> Self { - let mut value = value; - value.tables.build_index().unwrap(); - value.tables.tree_sequence(0.into()).unwrap() + value + .tables + .tree_sequence(tskit::TreeSequenceFlags::BUILD_INDEXES) + .unwrap() } } From 79f57e7552fc241f7f721bdb55599010b9b5a3aa Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 17 Jan 2023 12:34:36 -0800 Subject: [PATCH 41/55] num edges is the problem --- tests/test_edge_buffer.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/test_edge_buffer.rs b/tests/test_edge_buffer.rs index 9538c3fdc..71b1de9e1 100644 --- a/tests/test_edge_buffer.rs +++ b/tests/test_edge_buffer.rs @@ -270,6 +270,10 @@ proptest! { assert_eq!(standard_treeseq.num_trees(), standard_with_buffer.num_trees()); assert_eq!(standard_treeseq.num_trees(), standard_with_buffer_streaming.num_trees()); + assert_eq!(standard_treeseq.nodes().num_rows(), standard_with_buffer.nodes().num_rows()); + assert_eq!(standard_treeseq.nodes().num_rows(), standard_with_buffer_streaming.nodes().num_rows()); + assert_eq!(standard_treeseq.edges().num_rows(), standard_with_buffer.edges().num_rows()); + assert_eq!(standard_treeseq.edges().num_rows(), standard_with_buffer_streaming.edges().num_rows()); // cannot do KC distance b/c trees not fully coalesced. let mut trees_standard = standard_treeseq.tree_iterator(0).unwrap(); @@ -278,9 +282,10 @@ proptest! { while let Some(tree) = trees_standard.next() { let tree_with_buffer = trees_with_buffer.next().unwrap(); + let tree_with_buffer_streaming = trees_with_buffer_streaming.next().unwrap(); assert_eq!(tree.interval(), tree_with_buffer.interval()); + assert_eq!(tree.interval(), tree_with_buffer_streaming.interval()); assert_eq!(tree.total_branch_length(true).unwrap(), tree_with_buffer.total_branch_length(true).unwrap()); - let tree_with_buffer_streaming = trees_with_buffer_streaming.next().unwrap(); assert_eq!(tree.interval(), tree_with_buffer_streaming.interval()); //assert_eq!(tree.total_branch_length(true).unwrap(), tree_with_buffer_streaming.total_branch_length(true).unwrap()); } From 08f7056ea9eae4d8bdb85db1645d5399dd828066 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 17 Jan 2023 13:00:11 -0800 Subject: [PATCH 42/55] refactor for fmt/lsp --- tests/test_edge_buffer.rs | 107 +++++++++++++++++++++++++++----------- 1 file changed, 76 insertions(+), 31 deletions(-) diff --git a/tests/test_edge_buffer.rs b/tests/test_edge_buffer.rs index 71b1de9e1..3a8157aaa 100644 --- a/tests/test_edge_buffer.rs +++ b/tests/test_edge_buffer.rs @@ -26,6 +26,9 @@ trait Recording { fn simplify(&mut self, samples: &mut [NodeId]) -> Result<(), TskitError>; fn start_recording(&mut self, _parents: &[NodeId], _child: &[NodeId]) {} fn end_recording(&mut self) {} + + // sugar that we don't need later + fn num_edges(&self) -> tskit::SizeType; } struct TableCollectionWithBuffer { @@ -81,10 +84,15 @@ impl Recording for TableCollectionWithBuffer { Err(e) => Err(e), } } + + fn num_edges(&self) -> tskit::SizeType { + self.tables.edges().num_rows() + } } impl From for TreeSequence { fn from(value: TableCollectionWithBuffer) -> Self { + println!("buffer export {}", value.tables.edges().num_rows()); value .tables .tree_sequence(tskit::TreeSequenceFlags::BUILD_INDEXES) @@ -121,6 +129,7 @@ impl Recording for TableCollectionWithBufferForStreaming { fn add_node(&mut self, flags: u32, time: f64) -> Result { self.tables.add_node(flags, time, -1, -1) } + fn add_edge( &mut self, left: f64, @@ -137,7 +146,7 @@ impl Recording for TableCollectionWithBufferForStreaming { tskit::NodeId::NULL, ); tskit::simplfify_from_buffer( - &samples, + samples, tskit::SimplificationOptions::default(), &mut self.tables, &mut self.buffer, @@ -154,6 +163,10 @@ impl Recording for TableCollectionWithBufferForStreaming { .unwrap(); Ok(()) } + + fn num_edges(&self) -> tskit::SizeType { + self.tables.edges().num_rows() + } } impl From for TreeSequence { @@ -194,6 +207,9 @@ impl Recording for StandardTableCollection { Err(e) => Err(e), } } + fn num_edges(&self) -> tskit::SizeType { + self.0.edges().num_rows() + } } impl From for TreeSequence { @@ -242,52 +258,81 @@ where recorder.add_edge(0., 1., parent, child).unwrap(); recorder.end_recording(); } - for (r, b) in replacements.iter().zip(births.iter()) { assert!(*r < parents.len()); parents[*r] = *b; } if birth_time % simplify == 0 { + println!("before simplify: {} {}", birth_time, recorder.num_edges()); recorder.simplify(&mut parents).unwrap(); + println!("simplify: {} {}", birth_time, recorder.num_edges()); } } + println!("end {}", recorder.num_edges()); recorder.into() } +fn run_overlapping_generations_test(seed: u64, pdeath: f64, simplify_interval: i32) { + use streaming_iterator::StreamingIterator; + let standard = StandardTableCollection::new(); + let standard_treeseq = overlapping_generations(seed, pdeath, simplify_interval, standard); + let with_buffer = TableCollectionWithBuffer::new(); + let standard_with_buffer = + overlapping_generations(seed, pdeath, simplify_interval, with_buffer); + let with_buffer_streaming = TableCollectionWithBufferForStreaming::new(); + let standard_with_buffer_streaming = + overlapping_generations(seed, pdeath, simplify_interval, with_buffer_streaming); + + assert_eq!( + standard_treeseq.num_trees(), + standard_with_buffer.num_trees() + ); + assert_eq!( + standard_treeseq.num_trees(), + standard_with_buffer_streaming.num_trees() + ); + assert_eq!( + standard_treeseq.nodes().num_rows(), + standard_with_buffer.nodes().num_rows() + ); + assert_eq!( + standard_treeseq.nodes().num_rows(), + standard_with_buffer_streaming.nodes().num_rows() + ); + assert_eq!( + standard_treeseq.edges().num_rows(), + standard_with_buffer.edges().num_rows() + ); + assert_eq!( + standard_treeseq.edges().num_rows(), + standard_with_buffer_streaming.edges().num_rows() + ); + + // cannot do KC distance b/c trees not fully coalesced. + let mut trees_standard = standard_treeseq.tree_iterator(0).unwrap(); + let mut trees_with_buffer = standard_with_buffer.tree_iterator(0).unwrap(); + let mut trees_with_buffer_streaming = standard_with_buffer_streaming.tree_iterator(0).unwrap(); + + while let Some(tree) = trees_standard.next() { + let tree_with_buffer = trees_with_buffer.next().unwrap(); + let tree_with_buffer_streaming = trees_with_buffer_streaming.next().unwrap(); + assert_eq!(tree.interval(), tree_with_buffer.interval()); + assert_eq!(tree.interval(), tree_with_buffer_streaming.interval()); + assert_eq!( + tree.total_branch_length(true).unwrap(), + tree_with_buffer.total_branch_length(true).unwrap() + ); + assert_eq!(tree.interval(), tree_with_buffer_streaming.interval()); + //assert_eq!(tree.total_branch_length(true).unwrap(), tree_with_buffer_streaming.total_branch_length(true).unwrap()); + } +} + #[cfg(test)] proptest! { #[test] fn test_edge_buffer_overlapping_generations(seed in any::(), pdeath in 0.05..1.0, simplify_interval in 1..100i32) { - use streaming_iterator::StreamingIterator; - let standard = StandardTableCollection::new(); - let standard_treeseq = overlapping_generations(seed, pdeath, simplify_interval, standard); - let with_buffer = TableCollectionWithBuffer::new(); - let standard_with_buffer = overlapping_generations(seed, pdeath, simplify_interval, with_buffer); - let with_buffer_streaming = TableCollectionWithBufferForStreaming::new(); - let standard_with_buffer_streaming = overlapping_generations(seed, pdeath, simplify_interval, with_buffer_streaming); - - assert_eq!(standard_treeseq.num_trees(), standard_with_buffer.num_trees()); - assert_eq!(standard_treeseq.num_trees(), standard_with_buffer_streaming.num_trees()); - assert_eq!(standard_treeseq.nodes().num_rows(), standard_with_buffer.nodes().num_rows()); - assert_eq!(standard_treeseq.nodes().num_rows(), standard_with_buffer_streaming.nodes().num_rows()); - assert_eq!(standard_treeseq.edges().num_rows(), standard_with_buffer.edges().num_rows()); - assert_eq!(standard_treeseq.edges().num_rows(), standard_with_buffer_streaming.edges().num_rows()); - - // cannot do KC distance b/c trees not fully coalesced. - let mut trees_standard = standard_treeseq.tree_iterator(0).unwrap(); - let mut trees_with_buffer = standard_with_buffer.tree_iterator(0).unwrap(); - let mut trees_with_buffer_streaming = standard_with_buffer_streaming.tree_iterator(0).unwrap(); - - while let Some(tree) = trees_standard.next() { - let tree_with_buffer = trees_with_buffer.next().unwrap(); - let tree_with_buffer_streaming = trees_with_buffer_streaming.next().unwrap(); - assert_eq!(tree.interval(), tree_with_buffer.interval()); - assert_eq!(tree.interval(), tree_with_buffer_streaming.interval()); - assert_eq!(tree.total_branch_length(true).unwrap(), tree_with_buffer.total_branch_length(true).unwrap()); - assert_eq!(tree.interval(), tree_with_buffer_streaming.interval()); - //assert_eq!(tree.total_branch_length(true).unwrap(), tree_with_buffer_streaming.total_branch_length(true).unwrap()); - } + run_overlapping_generations_test(seed, pdeath, simplify_interval) } } From d411782a134ea9a738f08c01b8639ced1bb53229 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 17 Jan 2023 13:05:35 -0800 Subject: [PATCH 43/55] improve match in tests --- tests/test_edge_buffer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_edge_buffer.rs b/tests/test_edge_buffer.rs index 3a8157aaa..1a14a174b 100644 --- a/tests/test_edge_buffer.rs +++ b/tests/test_edge_buffer.rs @@ -203,7 +203,7 @@ impl Recording for StandardTableCollection { } Ok(()) } - Ok(None) => Ok(()), + Ok(None) => panic!("need to remap input sample nodes"), Err(e) => Err(e), } } From bed6ebfb5c86a65fc493723cf44671c20c723352 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 17 Jan 2023 13:15:41 -0800 Subject: [PATCH 44/55] tests pass --- tests/test_edge_buffer.rs | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/tests/test_edge_buffer.rs b/tests/test_edge_buffer.rs index 1a14a174b..6ac82b989 100644 --- a/tests/test_edge_buffer.rs +++ b/tests/test_edge_buffer.rs @@ -1,13 +1,10 @@ #![cfg(feature = "edgebuffer")] -use std::ops::Deref; - use proptest::prelude::*; use rand::distributions::Distribution; use rand::SeedableRng; use tskit::EdgeBuffer; -use tskit::EdgeId; use tskit::NodeId; use tskit::TableCollection; use tskit::TreeSequence; @@ -24,6 +21,9 @@ trait Recording { ) -> Result<(), TskitError>; fn simplify(&mut self, samples: &mut [NodeId]) -> Result<(), TskitError>; + fn post_simplify(&mut self, _samples: &mut [NodeId]) -> Result<(), TskitError> { + Ok(()) + } fn start_recording(&mut self, _parents: &[NodeId], _child: &[NodeId]) {} fn end_recording(&mut self) {} @@ -75,9 +75,6 @@ impl Recording for TableCollectionWithBuffer { for s in samples.iter_mut() { *s = idmap[s.as_usize()]; } - self.buffer - .post_simplification(samples, &mut self.tables) - .unwrap(); Ok(()) } Ok(None) => panic!(), @@ -85,6 +82,10 @@ impl Recording for TableCollectionWithBuffer { } } + fn post_simplify(&mut self, samples: &mut [NodeId]) -> Result<(), TskitError> { + self.buffer.post_simplification(samples, &mut self.tables) + } + fn num_edges(&self) -> tskit::SizeType { self.tables.edges().num_rows() } @@ -92,7 +93,6 @@ impl Recording for TableCollectionWithBuffer { impl From for TreeSequence { fn from(value: TableCollectionWithBuffer) -> Self { - println!("buffer export {}", value.tables.edges().num_rows()); value .tables .tree_sequence(tskit::TreeSequenceFlags::BUILD_INDEXES) @@ -158,12 +158,13 @@ impl Recording for TableCollectionWithBufferForStreaming { *o = self.node_map[usize::try_from(*o).unwrap()]; assert!(!o.is_null()); } - self.buffer - .post_simplification(&samples, &mut self.tables) - .unwrap(); Ok(()) } + fn post_simplify(&mut self, samples: &mut [NodeId]) -> Result<(), TskitError> { + self.buffer.post_simplification(samples, &mut self.tables) + } + fn num_edges(&self) -> tskit::SizeType { self.tables.edges().num_rows() } @@ -263,12 +264,12 @@ where parents[*r] = *b; } if birth_time % simplify == 0 { - println!("before simplify: {} {}", birth_time, recorder.num_edges()); recorder.simplify(&mut parents).unwrap(); - println!("simplify: {} {}", birth_time, recorder.num_edges()); + if birth_time > 0 { + recorder.post_simplify(&mut parents).unwrap(); + } } } - println!("end {}", recorder.num_edges()); recorder.into() } @@ -323,7 +324,12 @@ fn run_overlapping_generations_test(seed: u64, pdeath: f64, simplify_interval: i tree_with_buffer.total_branch_length(true).unwrap() ); assert_eq!(tree.interval(), tree_with_buffer_streaming.interval()); - //assert_eq!(tree.total_branch_length(true).unwrap(), tree_with_buffer_streaming.total_branch_length(true).unwrap()); + assert_eq!( + tree.total_branch_length(true).unwrap(), + tree_with_buffer_streaming + .total_branch_length(true) + .unwrap() + ); } } From feaadf15e8d9228fb0ea148a9cc0da8dd978aac0 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 17 Jan 2023 13:17:28 -0800 Subject: [PATCH 45/55] remove uneeded test fn in trait --- tests/test_edge_buffer.rs | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/tests/test_edge_buffer.rs b/tests/test_edge_buffer.rs index 6ac82b989..57b838a9b 100644 --- a/tests/test_edge_buffer.rs +++ b/tests/test_edge_buffer.rs @@ -26,9 +26,6 @@ trait Recording { } fn start_recording(&mut self, _parents: &[NodeId], _child: &[NodeId]) {} fn end_recording(&mut self) {} - - // sugar that we don't need later - fn num_edges(&self) -> tskit::SizeType; } struct TableCollectionWithBuffer { @@ -85,10 +82,6 @@ impl Recording for TableCollectionWithBuffer { fn post_simplify(&mut self, samples: &mut [NodeId]) -> Result<(), TskitError> { self.buffer.post_simplification(samples, &mut self.tables) } - - fn num_edges(&self) -> tskit::SizeType { - self.tables.edges().num_rows() - } } impl From for TreeSequence { @@ -164,10 +157,6 @@ impl Recording for TableCollectionWithBufferForStreaming { fn post_simplify(&mut self, samples: &mut [NodeId]) -> Result<(), TskitError> { self.buffer.post_simplification(samples, &mut self.tables) } - - fn num_edges(&self) -> tskit::SizeType { - self.tables.edges().num_rows() - } } impl From for TreeSequence { @@ -208,9 +197,6 @@ impl Recording for StandardTableCollection { Err(e) => Err(e), } } - fn num_edges(&self) -> tskit::SizeType { - self.0.edges().num_rows() - } } impl From for TreeSequence { From 14d33bf21958fd2dc17efbd3a2de0b568e498d77 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 17 Jan 2023 13:27:24 -0800 Subject: [PATCH 46/55] dedup the tests --- tests/test_edge_buffer.rs | 69 ++++++++++++--------------------------- 1 file changed, 21 insertions(+), 48 deletions(-) diff --git a/tests/test_edge_buffer.rs b/tests/test_edge_buffer.rs index 57b838a9b..cc9d062f3 100644 --- a/tests/test_edge_buffer.rs +++ b/tests/test_edge_buffer.rs @@ -259,8 +259,26 @@ where recorder.into() } -fn run_overlapping_generations_test(seed: u64, pdeath: f64, simplify_interval: i32) { +fn compare_treeseqs(a: &TreeSequence, b: &TreeSequence) { use streaming_iterator::StreamingIterator; + assert_eq!(a.edges().num_rows(), b.edges().num_rows()); + assert_eq!(a.nodes().num_rows(), b.nodes().num_rows()); + assert_eq!(a.num_trees(), b.num_trees()); + + let mut trees_a = a.tree_iterator(0).unwrap(); + let mut trees_b = b.tree_iterator(0).unwrap(); + + while let Some(tree_a) = trees_a.next() { + let tree_b = trees_b.next().unwrap(); + assert_eq!(tree_a.interval(), tree_b.interval()); + assert_eq!( + tree_a.total_branch_length(true).unwrap(), + tree_b.total_branch_length(true).unwrap() + ); + } +} + +fn run_overlapping_generations_test(seed: u64, pdeath: f64, simplify_interval: i32) { let standard = StandardTableCollection::new(); let standard_treeseq = overlapping_generations(seed, pdeath, simplify_interval, standard); let with_buffer = TableCollectionWithBuffer::new(); @@ -270,53 +288,8 @@ fn run_overlapping_generations_test(seed: u64, pdeath: f64, simplify_interval: i let standard_with_buffer_streaming = overlapping_generations(seed, pdeath, simplify_interval, with_buffer_streaming); - assert_eq!( - standard_treeseq.num_trees(), - standard_with_buffer.num_trees() - ); - assert_eq!( - standard_treeseq.num_trees(), - standard_with_buffer_streaming.num_trees() - ); - assert_eq!( - standard_treeseq.nodes().num_rows(), - standard_with_buffer.nodes().num_rows() - ); - assert_eq!( - standard_treeseq.nodes().num_rows(), - standard_with_buffer_streaming.nodes().num_rows() - ); - assert_eq!( - standard_treeseq.edges().num_rows(), - standard_with_buffer.edges().num_rows() - ); - assert_eq!( - standard_treeseq.edges().num_rows(), - standard_with_buffer_streaming.edges().num_rows() - ); - - // cannot do KC distance b/c trees not fully coalesced. - let mut trees_standard = standard_treeseq.tree_iterator(0).unwrap(); - let mut trees_with_buffer = standard_with_buffer.tree_iterator(0).unwrap(); - let mut trees_with_buffer_streaming = standard_with_buffer_streaming.tree_iterator(0).unwrap(); - - while let Some(tree) = trees_standard.next() { - let tree_with_buffer = trees_with_buffer.next().unwrap(); - let tree_with_buffer_streaming = trees_with_buffer_streaming.next().unwrap(); - assert_eq!(tree.interval(), tree_with_buffer.interval()); - assert_eq!(tree.interval(), tree_with_buffer_streaming.interval()); - assert_eq!( - tree.total_branch_length(true).unwrap(), - tree_with_buffer.total_branch_length(true).unwrap() - ); - assert_eq!(tree.interval(), tree_with_buffer_streaming.interval()); - assert_eq!( - tree.total_branch_length(true).unwrap(), - tree_with_buffer_streaming - .total_branch_length(true) - .unwrap() - ); - } + compare_treeseqs(&standard_treeseq, &standard_with_buffer); + compare_treeseqs(&standard_treeseq, &standard_with_buffer_streaming); } #[cfg(test)] From 91463a1445625fa9cc2ab5e77d86ff534a1ecb07 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 17 Jan 2023 13:31:14 -0800 Subject: [PATCH 47/55] add crossover to the test --- tests/test_edge_buffer.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/test_edge_buffer.rs b/tests/test_edge_buffer.rs index cc9d062f3..6d6bf84a4 100644 --- a/tests/test_edge_buffer.rs +++ b/tests/test_edge_buffer.rs @@ -226,6 +226,7 @@ where let death = rand::distributions::Uniform::new(0., 1.0); let parent_picker = rand::distributions::Uniform::new(0, popsize); + let breakpoint_generator = rand::distributions::Uniform::new(0.0, 1.0); for birth_time in (0..10).rev() { let mut replacements = vec![]; @@ -239,10 +240,16 @@ where for _ in 0..replacements.len() { let parent_index = parent_picker.sample(&mut rng); let parent = parents[parent_index]; + let parent_index = parent_picker.sample(&mut rng); + let parent2 = parents[parent_index]; let child = recorder.add_node(0, birth_time as f64).unwrap(); births.push(child); - recorder.start_recording(&[parent], &[child]); - recorder.add_edge(0., 1., parent, child).unwrap(); + let child2 = recorder.add_node(0, birth_time as f64).unwrap(); + births.push(child2); + let breakpoint = breakpoint_generator.sample(&mut rng); + recorder.start_recording(&[parent, parent2], &[child, child2]); + recorder.add_edge(0., breakpoint, parent, child).unwrap(); + recorder.add_edge(breakpoint, 1., parent, child2).unwrap(); recorder.end_recording(); } for (r, b) in replacements.iter().zip(births.iter()) { From efb9bda791bd9995e03d4056d30a19e631ef63b3 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 17 Jan 2023 13:49:37 -0800 Subject: [PATCH 48/55] one fix due to a failing proptest --- tests/test_edge_buffer.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/test_edge_buffer.rs b/tests/test_edge_buffer.rs index 6d6bf84a4..74e9dc8ad 100644 --- a/tests/test_edge_buffer.rs +++ b/tests/test_edge_buffer.rs @@ -244,12 +244,10 @@ where let parent2 = parents[parent_index]; let child = recorder.add_node(0, birth_time as f64).unwrap(); births.push(child); - let child2 = recorder.add_node(0, birth_time as f64).unwrap(); - births.push(child2); let breakpoint = breakpoint_generator.sample(&mut rng); - recorder.start_recording(&[parent, parent2], &[child, child2]); + recorder.start_recording(&[parent, parent2], &[child]); recorder.add_edge(0., breakpoint, parent, child).unwrap(); - recorder.add_edge(breakpoint, 1., parent, child2).unwrap(); + recorder.add_edge(breakpoint, 1., parent2, child).unwrap(); recorder.end_recording(); } for (r, b) in replacements.iter().zip(births.iter()) { @@ -299,6 +297,11 @@ fn run_overlapping_generations_test(seed: u64, pdeath: f64, simplify_interval: i compare_treeseqs(&standard_treeseq, &standard_with_buffer_streaming); } +#[test] +fn failing_test_params() { + run_overlapping_generations_test(3491384373429438832, 0.49766542321295254, 1) +} + #[cfg(test)] proptest! { #[test] From ba3e68e9d8d24d84ff9fb48a59ae3ec79a7e497e Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 17 Jan 2023 13:56:32 -0800 Subject: [PATCH 49/55] fix error in registering children during post-simplification buffering for overlapping generations --- src/edgebuffer.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/edgebuffer.rs b/src/edgebuffer.rs index ad7064618..f177a144e 100644 --- a/src/edgebuffer.rs +++ b/src/edgebuffer.rs @@ -357,7 +357,10 @@ impl EdgeBuffer { let right = tables.edges().right_slice(); let mut rv = 0; for pre in pre_existing_edges.iter() { - self.setup_births(&[parent[pre.first]], &child[pre.first..pre.last])?; + let mut uchild = child[pre.first..pre.last].to_owned(); + uchild.sort(); + uchild.dedup(); + self.setup_births(&[parent[pre.first]], &uchild)?; for e in pre.first..pre.last { assert_eq!(parent[e], parent[pre.first]); self.record_birth(parent[e], child[e], left[e], right[e])?; From abed0bece7f1ba93bf40506c5c0945f2c6ecb3ac Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 17 Jan 2023 14:01:58 -0800 Subject: [PATCH 50/55] remove duplicated effort --- src/edgebuffer.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/edgebuffer.rs b/src/edgebuffer.rs index f177a144e..94ae1868a 100644 --- a/src/edgebuffer.rs +++ b/src/edgebuffer.rs @@ -73,6 +73,7 @@ impl BufferedBirths { fn initialize(&mut self, parents: &[NodeId], children: &[NodeId]) -> Result<(), TskitError> { self.children = children.to_vec(); self.children.sort(); + self.children.dedup(); self.segments.clear(); // FIXME: don't do this work if the parent already exists for p in parents { @@ -357,10 +358,7 @@ impl EdgeBuffer { let right = tables.edges().right_slice(); let mut rv = 0; for pre in pre_existing_edges.iter() { - let mut uchild = child[pre.first..pre.last].to_owned(); - uchild.sort(); - uchild.dedup(); - self.setup_births(&[parent[pre.first]], &uchild)?; + self.setup_births(&[parent[pre.first]], &child[pre.first..pre.last])?; for e in pre.first..pre.last { assert_eq!(parent[e], parent[pre.first]); self.record_birth(parent[e], child[e], left[e], right[e])?; From 937528b239c1780d58d2d8abbf902f42576c1931 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 17 Jan 2023 14:06:46 -0800 Subject: [PATCH 51/55] ; --- tests/test_edge_buffer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_edge_buffer.rs b/tests/test_edge_buffer.rs index 74e9dc8ad..7c4084837 100644 --- a/tests/test_edge_buffer.rs +++ b/tests/test_edge_buffer.rs @@ -299,7 +299,7 @@ fn run_overlapping_generations_test(seed: u64, pdeath: f64, simplify_interval: i #[test] fn failing_test_params() { - run_overlapping_generations_test(3491384373429438832, 0.49766542321295254, 1) + run_overlapping_generations_test(3491384373429438832, 0.49766542321295254, 1); } #[cfg(test)] From f554b0ecbf558f89217eb4842850f58339700b13 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 17 Jan 2023 14:09:15 -0800 Subject: [PATCH 52/55] note for the future --- src/edgebuffer.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/edgebuffer.rs b/src/edgebuffer.rs index 94ae1868a..f3e31610c 100644 --- a/src/edgebuffer.rs +++ b/src/edgebuffer.rs @@ -347,6 +347,12 @@ impl EdgeBuffer { // we are buffering nodes with increasing node id // that are also more ancient. This is the opposite // order from what happens during a forward-time simulation. + // NOTE: the mechanics of this fn differ if we use + // "regular" simplification or streaming! + // For the former case, we have to do the setup/finalize + // business. For the latter, WE DO NOT. + // This differences suggests there are actually two types/impls + // being discussed here. fn buffer_existing_edges( &mut self, pre_existing_edges: Vec, From 4b11d758ccdef9658f653d8b8d4d5098a38b4e83 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Wed, 18 Jan 2023 08:47:49 -0800 Subject: [PATCH 53/55] make popsize/nsteps top-level variables in test --- tests/test_edge_buffer.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_edge_buffer.rs b/tests/test_edge_buffer.rs index 7c4084837..ab27673dd 100644 --- a/tests/test_edge_buffer.rs +++ b/tests/test_edge_buffer.rs @@ -214,13 +214,14 @@ where let mut rng = rand::rngs::StdRng::seed_from_u64(seed); let popsize = 10; + let nsteps = 10; let mut parents = vec![]; let mut recorder = recorder; for _ in 0..popsize { - let node = recorder.add_node(0, 10.0).unwrap(); + let node = recorder.add_node(0, nsteps as f64).unwrap(); parents.push(node); } @@ -228,7 +229,7 @@ where let parent_picker = rand::distributions::Uniform::new(0, popsize); let breakpoint_generator = rand::distributions::Uniform::new(0.0, 1.0); - for birth_time in (0..10).rev() { + for birth_time in (0..nsteps).rev() { let mut replacements = vec![]; for i in 0..parents.len() { if death.sample(&mut rng) <= pdeath { From 72392322d6b88842676318ea417b37fbc350285a Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Wed, 18 Jan 2023 08:50:57 -0800 Subject: [PATCH 54/55] update TODO list --- src/edgebuffer.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/edgebuffer.rs b/src/edgebuffer.rs index f3e31610c..b6b69537c 100644 --- a/src/edgebuffer.rs +++ b/src/edgebuffer.rs @@ -424,7 +424,11 @@ impl EdgeBuffer { // 4. We are doing this in the wrong temporal order. // We need to pre-process all existing edge intervals, // cache them, then go backwards through them, - // so that we buffer them present-to-past + // so that we buffer them present-to-past. + // DONE + // 5. This step should be EARLY in a recording epoch, + // so that we avoid the gotcha of stealing edges + // from the last generation of a simulation. pub fn post_simplification( &mut self, alive: &[NodeId], From 86bd18c6d86d8db72b8578d35b1acf4b26fec49c Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Wed, 18 Jan 2023 15:16:03 -0800 Subject: [PATCH 55/55] simplify things a lot --- src/edgebuffer.rs | 90 ++++++++++++++++---------------- subprojects/tskit/tskit/tables.c | 55 +++++++++---------- subprojects/tskit/tskit/tables.h | 5 ++ 3 files changed, 78 insertions(+), 72 deletions(-) diff --git a/src/edgebuffer.rs b/src/edgebuffer.rs index b6b69537c..200a2a62f 100644 --- a/src/edgebuffer.rs +++ b/src/edgebuffer.rs @@ -699,51 +699,51 @@ pub fn simplfify_from_buffer>( buffer.release_memory(); // Simplify pre-existing edges. - let mut i = 0; - let num_input_edges = simplifier.input_num_edges(); - while i < num_input_edges { - let p = simplifier.get_input_parent(i); - //let mut edge_check: Vec<(NodeId, Position)> = vec![]; - while i < num_input_edges && simplifier.get_input_parent(i) == p { - //assert!(!edge_check.iter().any(|x| *x == (child[i], left[i]))); - simplifier.add_edge( - simplifier.get_input_left(i), - simplifier.get_input_right(i), - simplifier.get_input_parent(i), - simplifier.get_input_child(i), - )?; - //edge_check.push((child[i], left[i])); - i += 1; - } - simplifier.merge_ancestors(p)?; - // major stress-test -- delete later - //{ - // let l = tables.edges().left_slice(); - // let p = tables.edges().parent_slice(); - // let c = tables.edges().child_slice(); - // let mut i = 0; - // while i < l.len() { - // let pi = p[i]; - // while i < l.len() && p[i] == pi { - // if i > 0 && c[i] == c[i - 1] { - // assert_ne!( - // l[i], - // l[i - 1], - // "{:?},{:?} | {:?},{:?} | {:?},{:?} => {:?}", - // p[i], - // p[i - 1], - // c[i], - // c[i - 1], - // l[i], - // l[i - 1], - // edge_check - // ); - // } - // i += 1; - // } - // } - //} - } + //let mut i = 0; + //let num_input_edges = simplifier.input_num_edges(); + //while i < num_input_edges { + // let p = simplifier.get_input_parent(i); + // //let mut edge_check: Vec<(NodeId, Position)> = vec![]; + // while i < num_input_edges && simplifier.get_input_parent(i) == p { + // //assert!(!edge_check.iter().any(|x| *x == (child[i], left[i]))); + // simplifier.add_edge( + // simplifier.get_input_left(i), + // simplifier.get_input_right(i), + // simplifier.get_input_parent(i), + // simplifier.get_input_child(i), + // )?; + // //edge_check.push((child[i], left[i])); + // i += 1; + // } + // simplifier.merge_ancestors(p)?; + // // major stress-test -- delete later + // //{ + // // let l = tables.edges().left_slice(); + // // let p = tables.edges().parent_slice(); + // // let c = tables.edges().child_slice(); + // // let mut i = 0; + // // while i < l.len() { + // // let pi = p[i]; + // // while i < l.len() && p[i] == pi { + // // if i > 0 && c[i] == c[i - 1] { + // // assert_ne!( + // // l[i], + // // l[i - 1], + // // "{:?},{:?} | {:?},{:?} | {:?},{:?} => {:?}", + // // p[i], + // // p[i - 1], + // // c[i], + // // c[i - 1], + // // l[i], + // // l[i - 1], + // // edge_check + // // ); + // // } + // // i += 1; + // // } + // // } + // //} + //} simplifier.finalise(node_map)?; Ok(()) diff --git a/subprojects/tskit/tskit/tables.c b/subprojects/tskit/tskit/tables.c index 4f64958b0..52c84e333 100644 --- a/subprojects/tskit/tskit/tables.c +++ b/subprojects/tskit/tskit/tables.c @@ -12912,33 +12912,34 @@ int tsk_streaming_simplifier_merge_ancestors(tsk_streaming_simplifier_t * self, int tsk_streaming_simplifier_finalise(tsk_streaming_simplifier_t * self, tsk_id_t * node_map) { int ret = 0; simplifier_t * simplifier = &self->pimpl->simplifier; - if (simplifier->options & TSK_SIMPLIFY_KEEP_INPUT_ROOTS) { - ret = simplifier_insert_input_roots(simplifier); - if (ret != 0) { - goto out; - } - } - ret = simplifier_output_sites(simplifier); - if (ret != 0) { - goto out; - } - ret = simplifier_finalise_references(simplifier); - if (ret != 0) { - goto out; - } - if (node_map != NULL) { - /* Finally, output the new IDs for the nodes, if required. */ - tsk_memcpy(node_map, simplifier->node_id_map, - simplifier->input_tables.nodes.num_rows * sizeof(tsk_id_t)); - } - if (simplifier->edge_sort_offset != TSK_NULL) { - tsk_bug_assert(simplifier->options & TSK_SIMPLIFY_KEEP_INPUT_ROOTS); - ret = simplifier_sort_edges(simplifier); - if (ret != 0) { - goto out; - } - } -out: + ret = simplifier_run(simplifier, node_map); + //if (simplifier->options & TSK_SIMPLIFY_KEEP_INPUT_ROOTS) { + // ret = simplifier_insert_input_roots(simplifier); + // if (ret != 0) { + // goto out; + // } + //} + //ret = simplifier_output_sites(simplifier); + //if (ret != 0) { + // goto out; + //} + //ret = simplifier_finalise_references(simplifier); + //if (ret != 0) { + // goto out; + //} + //if (node_map != NULL) { + // /* Finally, output the new IDs for the nodes, if required. */ + // tsk_memcpy(node_map, simplifier->node_id_map, + // simplifier->input_tables.nodes.num_rows * sizeof(tsk_id_t)); + //} + //if (simplifier->edge_sort_offset != TSK_NULL) { + // tsk_bug_assert(simplifier->options & TSK_SIMPLIFY_KEEP_INPUT_ROOTS); + // ret = simplifier_sort_edges(simplifier); + // if (ret != 0) { + // goto out; + // } + //} +//out: return ret; } diff --git a/subprojects/tskit/tskit/tables.h b/subprojects/tskit/tskit/tables.h index 49a926769..397c8c8ac 100644 --- a/subprojects/tskit/tskit/tables.h +++ b/subprojects/tskit/tskit/tables.h @@ -684,7 +684,12 @@ int tsk_streaming_simplifier_free(tsk_streaming_simplifier_t * self); int tsk_streaming_simplifier_add_edge(tsk_streaming_simplifier_t * self, double left, double right, tsk_id_t parent, tsk_id_t child); int tsk_streaming_simplifier_merge_ancestors(tsk_streaming_simplifier_t * self, tsk_id_t parent); + +// runs the simplifier, thus processing ancient edges +// present in the input edge table. int tsk_streaming_simplifier_finalise(tsk_streaming_simplifier_t * self, tsk_id_t *node_map); + +// None of this is needed anymore. const tsk_id_t * tsk_streaming_simplifier_get_input_parent(const tsk_streaming_simplifier_t * self); const tsk_id_t * tsk_streaming_simplifier_get_input_child(const tsk_streaming_simplifier_t * self); const double * tsk_streaming_simplifier_get_input_left(const tsk_streaming_simplifier_t * self);