Merge branch 'quadratic-runtime'

eblerjana · Jan 19, 2023 · 8674d3a · 8674d3a
2 parents c48a7b6 + 5ea3d77
commit 8674d3a
Show file tree

Hide file tree

Showing 13 changed files with 282 additions and 533 deletions.
diff --git a/README.md b/README.md
@@ -8,7 +8,7 @@ A short-read genotyper for various types of genetic variants (such as SNPs, inde
 ## Installation
 
 
-### Building from source using Singularity
+### Building from source using Singularity (recommended)
 
 Use the Singularity definition file located in ``container/`` to build an (Ubuntu-based) container as follows (requires root privileges):
 
@@ -127,16 +127,12 @@ options:
 
 ## Runtime and memory usage
 
-Runtime and memory usage depend on the number of variants genotyped and the number of haplotypes present in the graph.
+Runtime and memory usage depend on the number of variants genotyped and the number of haplotypes present in the graph. PanGenie is fastest when it is installed using Singularity (see above).
 
-With the data described here: https://doi.org/10.1038/s41588-022-01043-w, PanGenie ran in 1 hour and 25 minutes walltime using 22 cores (16 CPU hours) and used 68 GB RAM.
-The largest dataset that we have tested contained around 16M variants, 64 haplotypes and around 30x read coverage. Using 24 cores, PanGenie run in 1 hour and 46 minutes (24 CPU hours) and used 120 GB of RAM.
+With the data described here: https://doi.org/10.1038/s41588-022-01043-w, PanGenie ran in 1 hour and 5 minutes walltime using 24 cores (16 CPU hours) and used 68 GB RAM.
+The largest dataset that we have tested (HPRC: https://doi.org/10.1101/2022.07.09.499321) contained around 27 million variants, 88 haplotypes and around 30x read coverage. Using 24 cores, PanGenie run in 1 hour and 57 minutes (19 CPU hours) and used 86 GB of RAM.
 
 
-## Notes
-
-The largest panel we have run PanGenie on so far consisted of 44 samples (88 haplotypes). On this data, PanGenie needed 53 CPU hours (03:15 h wallclock time using 24 cores) and 153 GB of memory in order to genotype 20,661,169 variants.
-
 ## Limitations
 
 The runtime of PanGenie gets slow as the number of haplotype paths increases. Due to technical reasons, the current implementation of PanGenie cannot handle more than 254 input haplotypes (127 diploid samples).

diff --git a/src/hmm.cpp b/src/hmm.cpp
@@ -187,6 +187,8 @@ void HMM::compute_viterbi_path() {
 }
 
 void HMM::compute_forward_column(size_t column_index) {
+	// NOTE: this implementation assumes that all variant positions are covered by the same set of paths
+
 	assert(column_index < this->column_indexers.size());
 	size_t variant_id = this->column_indexers.at(column_index)->get_variant_id();
 
@@ -212,7 +214,6 @@ void HMM::compute_forward_column(size_t column_index) {
 		size_t prev_pos = this->unique_kmers->at(prev_index)->get_variant_position();
 		size_t cur_pos = this->unique_kmers->at(cur_index)->get_variant_position();
 		transition_probability_computer = new TransitionProbabilityComputer(prev_pos, cur_pos, this->recombrate, nr_paths, this->uniform, this->effective_N);
-
 	}
 
 	// construct new column
@@ -221,41 +222,48 @@ void HMM::compute_forward_column(size_t column_index) {
 	// emission probability computer
 	EmissionProbabilityComputer emission_probability_computer(this->unique_kmers->at(variant_id), this->probabilities);
 
+	// pre-compute helper variables
+	unsigned short nr_prev_paths = 0;
+	if (column_index > 0) {
+		nr_prev_paths = previous_indexer->nr_paths();
+		// the assumption is that all variants are covered by the same set of paths, therefore these numbers must be equal
+		assert(nr_prev_paths == nr_paths);
+	}
+
+	vector<long double> helper_i(nr_prev_paths);
+	vector<long double> helper_j(nr_prev_paths);
+	long double helper_ij = 0.0;
+
+	if (column_index > 0) {
+		size_t i = 0;
+		for (unsigned short path_id1 = 0; path_id1 < nr_prev_paths; ++path_id1) {
+			for (unsigned short path_id2 = 0; path_id2 < nr_prev_paths; ++path_id2) {
+				long double prev_forward = previous_column->at(i);
+				helper_i[path_id1] += prev_forward;
+				helper_j[path_id2] += prev_forward;
+				helper_ij += prev_forward;
+				i += 1;
+			}
+		}
+	}
+
 	// normalization
 	long double normalization_sum = 0.0L;
 
 	// state index
 	size_t i = 0;
-	unsigned short nr_prev_paths = 0;
-	if (column_index > 0) nr_prev_paths = previous_indexer->nr_paths();
 	// iterate over all pairs of current paths
 	for (unsigned short path_id1 = 0; path_id1 < nr_paths; ++path_id1) {
 		for (unsigned short path_id2 = 0; path_id2 < nr_paths; ++path_id2) {
-			// get paths corresponding to path indices
-			unsigned short path1 = column_indexer->get_path(path_id1);
-			unsigned short path2 = column_indexer->get_path(path_id2);
 			long double previous_cell = 0.0L;
 			if (column_index > 0) {
-				// previous state index
-				size_t j = 0;
-				// iterate over all pairs of previous paths
-				for (unsigned short prev_path_id1 = 0; prev_path_id1 < nr_prev_paths; ++prev_path_id1) {
-					for (unsigned short prev_path_id2 = 0; prev_path_id2 < nr_prev_paths; ++prev_path_id2) {
-						// forward probability of previous cell
-						long double prev_forward = previous_column->at(j);
-						// paths corresponding to path indices
-						unsigned short prev_path1 = previous_indexer->get_path(prev_path_id1);
-						unsigned short prev_path2 = previous_indexer->get_path(prev_path_id2);
-
-						// determine transition probability
-						long double transition_prob = transition_probability_computer->compute_transition_prob(prev_path1, prev_path2, path1, path2);
-						previous_cell += prev_forward * transition_prob;
-						j += 1;
-					}
-				}
+				previous_cell = transition_probability_computer->compute_transition_prob(0) * previous_column->at(i) +
+								transition_probability_computer->compute_transition_prob(1) * (helper_i[path_id1] + helper_j[path_id2] - 2*previous_column->at(i)) + 
+								transition_probability_computer->compute_transition_prob(2) * (helper_ij - helper_i[path_id1] - helper_j[path_id2] + previous_column->at(i));
 			} else {
 				previous_cell = 1.0L;
 			}
+
 			// determine alleles current paths (ids) correspond to
 			unsigned char allele1 = column_indexer->get_allele(path_id1);
 			unsigned char allele2 = column_indexer->get_allele(path_id2);
@@ -334,6 +342,31 @@ void HMM::compute_backward_column(size_t column_index) {
 		assert (forward_column != nullptr);
 	}
 
+	// pre-compute helper variables
+	unsigned short nr_prev_paths = 0;
+	if (column_index < column_count - 1) {
+		nr_prev_paths = previous_indexer->nr_paths();
+		assert (nr_prev_paths == nr_paths);
+	}
+
+	vector<long double> helper_i(nr_prev_paths);
+	vector<long double> helper_j(nr_prev_paths);
+	long double helper_ij = 0.0;
+
+	if (column_index < column_count - 1) {
+		size_t i = 0;
+		for (unsigned short path_id1 = 0; path_id1 < nr_prev_paths; ++path_id1) {
+			unsigned char prev_allele1 = previous_indexer->get_allele(path_id1);
+			for (unsigned short path_id2 = 0; path_id2 < nr_prev_paths; ++path_id2) {
+				unsigned char prev_allele2 = previous_indexer->get_allele(path_id2);
+				helper_i[path_id1] += this->previous_backward_column->at(i) * emission_probability_computer->get_emission_probability(prev_allele1, prev_allele2);
+				helper_j[path_id2] += this->previous_backward_column->at(i) * emission_probability_computer->get_emission_probability(prev_allele1, prev_allele2);
+				helper_ij += this->previous_backward_column->at(i) * emission_probability_computer->get_emission_probability(prev_allele1, prev_allele2);
+				i += 1;
+			}
+		}
+	}
+
 	// construct new column
 	vector<long double>* current_column = new vector<long double>();
 
@@ -345,36 +378,22 @@ void HMM::compute_backward_column(size_t column_index) {
 
 	// state index
 	size_t i = 0;
-	unsigned short nr_prev_paths = 0;
-	if (column_index < column_count - 1) nr_prev_paths = previous_indexer->nr_paths();
 	// iterate over all pairs of current paths
 	for (unsigned short path_id1 = 0; path_id1 < nr_paths; ++path_id1) {
 		for (unsigned short path_id2 = 0; path_id2 < nr_paths; ++path_id2) {
-			// get paths corresponding to path indices
-			unsigned short path1 = column_indexer->get_path(path_id1);
-			unsigned short path2 = column_indexer->get_path(path_id2);
 			// get alleles on current paths
 			unsigned char allele1 = column_indexer->get_allele(path_id1);
 			unsigned char allele2 = column_indexer->get_allele(path_id2);
 			long double current_cell = 0.0L;
 			if (column_index < column_count - 1) {
+				// get alleles on previous paths, assuming indexes are same as current column
+				unsigned char prev_allele1 = previous_indexer->get_allele(path_id1);
+				unsigned char prev_allele2 = previous_indexer->get_allele(path_id2);
+				long double helper_cell = this->previous_backward_column->at(i) * emission_probability_computer->get_emission_probability(prev_allele1, prev_allele2);
 				// iterate over previous column (ahead of this)
-				size_t j = 0;
-				for (unsigned short prev_path_id1 = 0; prev_path_id1 < nr_prev_paths; ++prev_path_id1) {
-					for (unsigned short prev_path_id2 = 0; prev_path_id2 < nr_prev_paths; ++prev_path_id2) {
-						// paths corresponding to path indices
-						unsigned short prev_path1 = previous_indexer->get_path(prev_path_id1);
-						unsigned short prev_path2 = previous_indexer->get_path(prev_path_id2);
-						// alleles on previous paths
-						unsigned char prev_allele1 = previous_indexer->get_allele(prev_path_id1);
-						unsigned char prev_allele2 = previous_indexer->get_allele(prev_path_id2);
-						long double prev_backward = this->previous_backward_column->at(j);
-						// determine transition probability
-						long double transition_prob = transition_probability_computer->compute_transition_prob(path1, path2, prev_path1, prev_path2);
-						current_cell += prev_backward * transition_prob * emission_probability_computer->get_emission_probability(prev_allele1, prev_allele2);
-						j += 1;
-					}
-				}
+				current_cell =	transition_probability_computer->compute_transition_prob(0) * helper_cell +
+								transition_probability_computer->compute_transition_prob(1) * (helper_i[path_id1] + helper_j[path_id2] - 2*helper_cell) +
+								transition_probability_computer->compute_transition_prob(2) * (helper_ij - helper_i[path_id1] - helper_j[path_id2] + helper_cell);
 			} else {
 				current_cell = 1.0L;
 			}
@@ -392,6 +411,7 @@ void HMM::compute_backward_column(size_t column_index) {
 		}
 	}
 
+
 	if (normalization_sum > 0.0L) {
 		transform(current_column->begin(), current_column->end(), current_column->begin(), bind(divides<long double>(), placeholders::_1, normalization_sum));
 	} else {
@@ -425,6 +445,7 @@ void HMM::compute_backward_column(size_t column_index) {
 	}
 }
 
+
 void HMM::compute_viterbi_column(size_t column_index) {
 	assert(column_index < this->column_indexers.size());
 	size_t variant_id = this->column_indexers.at(column_index)->get_variant_id();
@@ -543,4 +564,4 @@ vector<GenotypingResult> HMM::get_genotyping_result() const {
 
 vector<GenotypingResult> HMM::move_genotyping_result() {
 	return move(this->genotyping_result);
-}
+}
diff --git a/src/transitionprobabilitycomputer.cpp b/src/transitionprobabilitycomputer.cpp
@@ -23,9 +23,17 @@ long double TransitionProbabilityComputer::compute_transition_prob(unsigned shor
 		return 1.0L;
 	} else {
 		// determine number of recombination events
-		unsigned int nr_events = 0;
-		if (path_id1 != path_id3) nr_events += 1;
-		if (path_id2 != path_id4) nr_events += 1;
-		return this->probabilities[nr_events];
+		unsigned int nr_switches = 0;
+		if (path_id1 != path_id3) nr_switches += 1;
+		if (path_id2 != path_id4) nr_switches += 1;
+		return this->probabilities[nr_switches];
+	}
+}
+
+long double TransitionProbabilityComputer::compute_transition_prob(unsigned short nr_switches) {
+	if (this->uniform) {
+		return 1.0L;
+	} else {
+		return this->probabilities[nr_switches];
 	}
 }
diff --git a/src/transitionprobabilitycomputer.hpp b/src/transitionprobabilitycomputer.hpp
@@ -10,7 +10,10 @@
 class TransitionProbabilityComputer {
 public:
 	TransitionProbabilityComputer(size_t from_variant, size_t to_variant, double recomb_rate, unsigned short nr_paths, bool uniform = false, long double effective_N = 25000.0L);
+	// computes the transition probability given previous and current paths
 	long double compute_transition_prob(unsigned short path_id1, unsigned short path_id2, unsigned short path_id3, unsigned short path_id4);
+	// computes the transition probability based on the number of haplotype switches
+	long double compute_transition_prob(unsigned short nr_switches);
 private:
 	std::vector<long double> probabilities;
 	bool uniform;

diff --git a/src/uniquekmercomputer.cpp b/src/uniquekmercomputer.cpp
@@ -52,18 +52,18 @@ void UniqueKmerComputer::compute_unique_kmers(vector<UniqueKmers*>* result, Prob
 
 		map <jellyfish::mer_dna, vector<unsigned char>> occurences;
 		const Variant& variant = this->variants->get_variant(this->chromosome, v);
-		UniqueKmers* u = new UniqueKmers(variant.get_start_position());
-		u->set_coverage(kmer_coverage);
-		size_t nr_alleles = variant.nr_of_alleles();
-
-		// insert empty alleles (to also capture paths for which no unique kmers exist)
+
+		vector<unsigned char> path_to_alleles;
 		assert(variant.nr_of_paths() < 65535);
 		for (unsigned short p = 0; p < variant.nr_of_paths(); ++p) {
 			unsigned char a = variant.get_allele_on_path(p);
-			u->insert_empty_allele(a);
-			u->insert_path(p,a);
+			path_to_alleles.push_back(a);
 		}
 
+		UniqueKmers* u = new UniqueKmers(variant.get_start_position(), path_to_alleles);
+		u->set_coverage(kmer_coverage);
+		size_t nr_alleles = variant.nr_of_alleles();
+
 		for (unsigned char a = 0; a < nr_alleles; ++a) {
 			// consider all alleles not undefined
 			if (variant.is_undefined_allele(a)) {
@@ -131,16 +131,13 @@ void UniqueKmerComputer::compute_empty(vector<UniqueKmers*>* result) const {
 	size_t nr_variants = this->variants->size_of(this->chromosome);
 	for (size_t v = 0; v < nr_variants; ++v) {
 		const Variant& variant = this->variants->get_variant(this->chromosome, v);
-		UniqueKmers* u = new UniqueKmers(variant.get_start_position());
-		size_t nr_alleles = variant.nr_of_alleles();
-
-		// insert empty alleles and paths
+		vector<unsigned char> path_to_alleles;
 		assert(variant.nr_of_paths() < 65535);
 		for (unsigned short p = 0; p < variant.nr_of_paths(); ++p) {
 			unsigned char a = variant.get_allele_on_path(p);
-			u->insert_empty_allele(a);
-			u->insert_path(p,a);
+			path_to_alleles.push_back(a);
 		}
+		UniqueKmers* u = new UniqueKmers(variant.get_start_position(), path_to_alleles);
 		result->push_back(u);
 	}
 }

diff --git a/src/uniquekmers.cpp b/src/uniquekmers.cpp
@@ -4,24 +4,23 @@
 
 using namespace std;
 
-UniqueKmers::UniqueKmers(size_t variant_position)
+UniqueKmers::UniqueKmers(size_t variant_position, vector<unsigned char>& alleles)
 	:variant_pos(variant_position),
 	 current_index(0),
-	 local_coverage(0)
-{}
+	 local_coverage(0),
+	 path_to_allele(alleles.size())
+{
+	for (size_t i = 0; i < alleles.size(); ++i) {
+		unsigned char a = alleles[i];
+		this->path_to_allele[i] = a;
+		this->alleles[a] = make_pair(KmerPath(), false);
+	}
+}
 
 size_t UniqueKmers::get_variant_position() {
 	return this->variant_pos;
 }
 
-void UniqueKmers::insert_empty_allele(unsigned char allele_id, bool is_undefined) {
-	this->alleles[allele_id] = make_pair(KmerPath(), is_undefined); 
-}
-
-void UniqueKmers::insert_path(unsigned short path_id, unsigned char allele_id) {
-	this->path_to_allele[path_id] = allele_id;
-}
-
 void UniqueKmers::insert_kmer(unsigned short readcount,  vector<unsigned char>& alleles){
 	size_t index = this->current_index;
 	this->kmer_to_count.push_back(readcount);
@@ -33,9 +32,10 @@ void UniqueKmers::insert_kmer(unsigned short readcount,  vector<unsigned char>&
 
 bool UniqueKmers::kmer_on_path(size_t kmer_index, size_t path_index) const {
 	// check if path_id exists
-	if (this->path_to_allele.find(path_index) == this->path_to_allele.end()) {
+	if (path_index >= this->path_to_allele.size()) {
 		throw runtime_error("UniqueKmers::kmer_on_path: path_index " + to_string(path_index) + " does not exist.");
 	}
+
 	// check if kmer_index is valid and look up position
 	if (kmer_index < this->current_index) {
 		unsigned char allele_id = this->path_to_allele.at(path_index);
@@ -66,17 +66,16 @@ void UniqueKmers::get_path_ids(vector<unsigned short>& p, vector<unsigned char>&
 		// only return paths that are also contained in only_include
 		for (auto p_it = only_include->begin(); p_it != only_include->end(); ++p_it) {
 			// check if path is in path_to_allele
-			auto it = this->path_to_allele.find(*p_it);
-			if (it != this->path_to_allele.end()) {
+			if (*p_it < this->path_to_allele.size()) {
 				p.push_back(*p_it);
-				a.push_back(it->second);
+				a.push_back(this->path_to_allele.at(*p_it));
 			}
 		}
 	} else {
 		// return all paths and corresponding alleles
-		for (auto it = this->path_to_allele.begin(); it != this->path_to_allele.end(); ++it) {
-			p.push_back(it->first);
-			a.push_back(it->second);
+		for (size_t i = 0; i < this->path_to_allele.size(); ++i) {
+			p.push_back(i);
+			a.push_back(this->path_to_allele.at(i));
 		}
 	}
 }
@@ -109,8 +108,8 @@ ostream& operator<< (ostream& stream, const UniqueKmers& uk) {
 	}
 
 	stream << "paths:" << endl;
-	for (auto it = uk.path_to_allele.begin(); it != uk.path_to_allele.end(); ++it) {
-		stream << it->first << " covers allele " << (unsigned int) it->second << endl;
+	for (size_t  i = 0; i < uk.path_to_allele.size(); ++i) {
+		stream << i << " covers allele " << (unsigned int) uk.path_to_allele.at(i) << endl;
 	}
 	return stream;
 }