Skip to content

Commit

Permalink
Merge branch 'quadratic-runtime'
Browse files Browse the repository at this point in the history
  • Loading branch information
eblerjana committed Jan 19, 2023
2 parents c48a7b6 + 5ea3d77 commit 8674d3a
Show file tree
Hide file tree
Showing 13 changed files with 282 additions and 533 deletions.
12 changes: 4 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ A short-read genotyper for various types of genetic variants (such as SNPs, inde
## Installation


### Building from source using Singularity
### Building from source using Singularity (recommended)

Use the Singularity definition file located in ``container/`` to build an (Ubuntu-based) container as follows (requires root privileges):

Expand Down Expand Up @@ -127,16 +127,12 @@ options:

## Runtime and memory usage

Runtime and memory usage depend on the number of variants genotyped and the number of haplotypes present in the graph.
Runtime and memory usage depend on the number of variants genotyped and the number of haplotypes present in the graph. PanGenie is fastest when it is installed using Singularity (see above).

With the data described here: https://doi.org/10.1038/s41588-022-01043-w, PanGenie ran in 1 hour and 25 minutes walltime using 22 cores (16 CPU hours) and used 68 GB RAM.
The largest dataset that we have tested contained around 16M variants, 64 haplotypes and around 30x read coverage. Using 24 cores, PanGenie run in 1 hour and 46 minutes (24 CPU hours) and used 120 GB of RAM.
With the data described here: https://doi.org/10.1038/s41588-022-01043-w, PanGenie ran in 1 hour and 5 minutes walltime using 24 cores (16 CPU hours) and used 68 GB RAM.
The largest dataset that we have tested (HPRC: https://doi.org/10.1101/2022.07.09.499321) contained around 27 million variants, 88 haplotypes and around 30x read coverage. Using 24 cores, PanGenie run in 1 hour and 57 minutes (19 CPU hours) and used 86 GB of RAM.


## Notes

The largest panel we have run PanGenie on so far consisted of 44 samples (88 haplotypes). On this data, PanGenie needed 53 CPU hours (03:15 h wallclock time using 24 cores) and 153 GB of memory in order to genotype 20,661,169 variants.

## Limitations

The runtime of PanGenie gets slow as the number of haplotype paths increases. Due to technical reasons, the current implementation of PanGenie cannot handle more than 254 input haplotypes (127 diploid samples).
Expand Down
111 changes: 66 additions & 45 deletions src/hmm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,8 @@ void HMM::compute_viterbi_path() {
}

void HMM::compute_forward_column(size_t column_index) {
// NOTE: this implementation assumes that all variant positions are covered by the same set of paths

assert(column_index < this->column_indexers.size());
size_t variant_id = this->column_indexers.at(column_index)->get_variant_id();

Expand All @@ -212,7 +214,6 @@ void HMM::compute_forward_column(size_t column_index) {
size_t prev_pos = this->unique_kmers->at(prev_index)->get_variant_position();
size_t cur_pos = this->unique_kmers->at(cur_index)->get_variant_position();
transition_probability_computer = new TransitionProbabilityComputer(prev_pos, cur_pos, this->recombrate, nr_paths, this->uniform, this->effective_N);

}

// construct new column
Expand All @@ -221,41 +222,48 @@ void HMM::compute_forward_column(size_t column_index) {
// emission probability computer
EmissionProbabilityComputer emission_probability_computer(this->unique_kmers->at(variant_id), this->probabilities);

// pre-compute helper variables
unsigned short nr_prev_paths = 0;
if (column_index > 0) {
nr_prev_paths = previous_indexer->nr_paths();
// the assumption is that all variants are covered by the same set of paths, therefore these numbers must be equal
assert(nr_prev_paths == nr_paths);
}

vector<long double> helper_i(nr_prev_paths);
vector<long double> helper_j(nr_prev_paths);
long double helper_ij = 0.0;

if (column_index > 0) {
size_t i = 0;
for (unsigned short path_id1 = 0; path_id1 < nr_prev_paths; ++path_id1) {
for (unsigned short path_id2 = 0; path_id2 < nr_prev_paths; ++path_id2) {
long double prev_forward = previous_column->at(i);
helper_i[path_id1] += prev_forward;
helper_j[path_id2] += prev_forward;
helper_ij += prev_forward;
i += 1;
}
}
}

// normalization
long double normalization_sum = 0.0L;

// state index
size_t i = 0;
unsigned short nr_prev_paths = 0;
if (column_index > 0) nr_prev_paths = previous_indexer->nr_paths();
// iterate over all pairs of current paths
for (unsigned short path_id1 = 0; path_id1 < nr_paths; ++path_id1) {
for (unsigned short path_id2 = 0; path_id2 < nr_paths; ++path_id2) {
// get paths corresponding to path indices
unsigned short path1 = column_indexer->get_path(path_id1);
unsigned short path2 = column_indexer->get_path(path_id2);
long double previous_cell = 0.0L;
if (column_index > 0) {
// previous state index
size_t j = 0;
// iterate over all pairs of previous paths
for (unsigned short prev_path_id1 = 0; prev_path_id1 < nr_prev_paths; ++prev_path_id1) {
for (unsigned short prev_path_id2 = 0; prev_path_id2 < nr_prev_paths; ++prev_path_id2) {
// forward probability of previous cell
long double prev_forward = previous_column->at(j);
// paths corresponding to path indices
unsigned short prev_path1 = previous_indexer->get_path(prev_path_id1);
unsigned short prev_path2 = previous_indexer->get_path(prev_path_id2);

// determine transition probability
long double transition_prob = transition_probability_computer->compute_transition_prob(prev_path1, prev_path2, path1, path2);
previous_cell += prev_forward * transition_prob;
j += 1;
}
}
previous_cell = transition_probability_computer->compute_transition_prob(0) * previous_column->at(i) +
transition_probability_computer->compute_transition_prob(1) * (helper_i[path_id1] + helper_j[path_id2] - 2*previous_column->at(i)) +
transition_probability_computer->compute_transition_prob(2) * (helper_ij - helper_i[path_id1] - helper_j[path_id2] + previous_column->at(i));
} else {
previous_cell = 1.0L;
}

// determine alleles current paths (ids) correspond to
unsigned char allele1 = column_indexer->get_allele(path_id1);
unsigned char allele2 = column_indexer->get_allele(path_id2);
Expand Down Expand Up @@ -334,6 +342,31 @@ void HMM::compute_backward_column(size_t column_index) {
assert (forward_column != nullptr);
}

// pre-compute helper variables
unsigned short nr_prev_paths = 0;
if (column_index < column_count - 1) {
nr_prev_paths = previous_indexer->nr_paths();
assert (nr_prev_paths == nr_paths);
}

vector<long double> helper_i(nr_prev_paths);
vector<long double> helper_j(nr_prev_paths);
long double helper_ij = 0.0;

if (column_index < column_count - 1) {
size_t i = 0;
for (unsigned short path_id1 = 0; path_id1 < nr_prev_paths; ++path_id1) {
unsigned char prev_allele1 = previous_indexer->get_allele(path_id1);
for (unsigned short path_id2 = 0; path_id2 < nr_prev_paths; ++path_id2) {
unsigned char prev_allele2 = previous_indexer->get_allele(path_id2);
helper_i[path_id1] += this->previous_backward_column->at(i) * emission_probability_computer->get_emission_probability(prev_allele1, prev_allele2);
helper_j[path_id2] += this->previous_backward_column->at(i) * emission_probability_computer->get_emission_probability(prev_allele1, prev_allele2);
helper_ij += this->previous_backward_column->at(i) * emission_probability_computer->get_emission_probability(prev_allele1, prev_allele2);
i += 1;
}
}
}

// construct new column
vector<long double>* current_column = new vector<long double>();

Expand All @@ -345,36 +378,22 @@ void HMM::compute_backward_column(size_t column_index) {

// state index
size_t i = 0;
unsigned short nr_prev_paths = 0;
if (column_index < column_count - 1) nr_prev_paths = previous_indexer->nr_paths();
// iterate over all pairs of current paths
for (unsigned short path_id1 = 0; path_id1 < nr_paths; ++path_id1) {
for (unsigned short path_id2 = 0; path_id2 < nr_paths; ++path_id2) {
// get paths corresponding to path indices
unsigned short path1 = column_indexer->get_path(path_id1);
unsigned short path2 = column_indexer->get_path(path_id2);
// get alleles on current paths
unsigned char allele1 = column_indexer->get_allele(path_id1);
unsigned char allele2 = column_indexer->get_allele(path_id2);
long double current_cell = 0.0L;
if (column_index < column_count - 1) {
// get alleles on previous paths, assuming indexes are same as current column
unsigned char prev_allele1 = previous_indexer->get_allele(path_id1);
unsigned char prev_allele2 = previous_indexer->get_allele(path_id2);
long double helper_cell = this->previous_backward_column->at(i) * emission_probability_computer->get_emission_probability(prev_allele1, prev_allele2);
// iterate over previous column (ahead of this)
size_t j = 0;
for (unsigned short prev_path_id1 = 0; prev_path_id1 < nr_prev_paths; ++prev_path_id1) {
for (unsigned short prev_path_id2 = 0; prev_path_id2 < nr_prev_paths; ++prev_path_id2) {
// paths corresponding to path indices
unsigned short prev_path1 = previous_indexer->get_path(prev_path_id1);
unsigned short prev_path2 = previous_indexer->get_path(prev_path_id2);
// alleles on previous paths
unsigned char prev_allele1 = previous_indexer->get_allele(prev_path_id1);
unsigned char prev_allele2 = previous_indexer->get_allele(prev_path_id2);
long double prev_backward = this->previous_backward_column->at(j);
// determine transition probability
long double transition_prob = transition_probability_computer->compute_transition_prob(path1, path2, prev_path1, prev_path2);
current_cell += prev_backward * transition_prob * emission_probability_computer->get_emission_probability(prev_allele1, prev_allele2);
j += 1;
}
}
current_cell = transition_probability_computer->compute_transition_prob(0) * helper_cell +
transition_probability_computer->compute_transition_prob(1) * (helper_i[path_id1] + helper_j[path_id2] - 2*helper_cell) +
transition_probability_computer->compute_transition_prob(2) * (helper_ij - helper_i[path_id1] - helper_j[path_id2] + helper_cell);
} else {
current_cell = 1.0L;
}
Expand All @@ -392,6 +411,7 @@ void HMM::compute_backward_column(size_t column_index) {
}
}


if (normalization_sum > 0.0L) {
transform(current_column->begin(), current_column->end(), current_column->begin(), bind(divides<long double>(), placeholders::_1, normalization_sum));
} else {
Expand Down Expand Up @@ -425,6 +445,7 @@ void HMM::compute_backward_column(size_t column_index) {
}
}


void HMM::compute_viterbi_column(size_t column_index) {
assert(column_index < this->column_indexers.size());
size_t variant_id = this->column_indexers.at(column_index)->get_variant_id();
Expand Down Expand Up @@ -543,4 +564,4 @@ vector<GenotypingResult> HMM::get_genotyping_result() const {

vector<GenotypingResult> HMM::move_genotyping_result() {
return move(this->genotyping_result);
}
}
16 changes: 12 additions & 4 deletions src/transitionprobabilitycomputer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,17 @@ long double TransitionProbabilityComputer::compute_transition_prob(unsigned shor
return 1.0L;
} else {
// determine number of recombination events
unsigned int nr_events = 0;
if (path_id1 != path_id3) nr_events += 1;
if (path_id2 != path_id4) nr_events += 1;
return this->probabilities[nr_events];
unsigned int nr_switches = 0;
if (path_id1 != path_id3) nr_switches += 1;
if (path_id2 != path_id4) nr_switches += 1;
return this->probabilities[nr_switches];
}
}

long double TransitionProbabilityComputer::compute_transition_prob(unsigned short nr_switches) {
if (this->uniform) {
return 1.0L;
} else {
return this->probabilities[nr_switches];
}
}
3 changes: 3 additions & 0 deletions src/transitionprobabilitycomputer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,10 @@
class TransitionProbabilityComputer {
public:
TransitionProbabilityComputer(size_t from_variant, size_t to_variant, double recomb_rate, unsigned short nr_paths, bool uniform = false, long double effective_N = 25000.0L);
// computes the transition probability given previous and current paths
long double compute_transition_prob(unsigned short path_id1, unsigned short path_id2, unsigned short path_id3, unsigned short path_id4);
// computes the transition probability based on the number of haplotype switches
long double compute_transition_prob(unsigned short nr_switches);
private:
std::vector<long double> probabilities;
bool uniform;
Expand Down
23 changes: 10 additions & 13 deletions src/uniquekmercomputer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,18 +52,18 @@ void UniqueKmerComputer::compute_unique_kmers(vector<UniqueKmers*>* result, Prob

map <jellyfish::mer_dna, vector<unsigned char>> occurences;
const Variant& variant = this->variants->get_variant(this->chromosome, v);
UniqueKmers* u = new UniqueKmers(variant.get_start_position());
u->set_coverage(kmer_coverage);
size_t nr_alleles = variant.nr_of_alleles();

// insert empty alleles (to also capture paths for which no unique kmers exist)

vector<unsigned char> path_to_alleles;
assert(variant.nr_of_paths() < 65535);
for (unsigned short p = 0; p < variant.nr_of_paths(); ++p) {
unsigned char a = variant.get_allele_on_path(p);
u->insert_empty_allele(a);
u->insert_path(p,a);
path_to_alleles.push_back(a);
}

UniqueKmers* u = new UniqueKmers(variant.get_start_position(), path_to_alleles);
u->set_coverage(kmer_coverage);
size_t nr_alleles = variant.nr_of_alleles();

for (unsigned char a = 0; a < nr_alleles; ++a) {
// consider all alleles not undefined
if (variant.is_undefined_allele(a)) {
Expand Down Expand Up @@ -131,16 +131,13 @@ void UniqueKmerComputer::compute_empty(vector<UniqueKmers*>* result) const {
size_t nr_variants = this->variants->size_of(this->chromosome);
for (size_t v = 0; v < nr_variants; ++v) {
const Variant& variant = this->variants->get_variant(this->chromosome, v);
UniqueKmers* u = new UniqueKmers(variant.get_start_position());
size_t nr_alleles = variant.nr_of_alleles();

// insert empty alleles and paths
vector<unsigned char> path_to_alleles;
assert(variant.nr_of_paths() < 65535);
for (unsigned short p = 0; p < variant.nr_of_paths(); ++p) {
unsigned char a = variant.get_allele_on_path(p);
u->insert_empty_allele(a);
u->insert_path(p,a);
path_to_alleles.push_back(a);
}
UniqueKmers* u = new UniqueKmers(variant.get_start_position(), path_to_alleles);
result->push_back(u);
}
}
Expand Down
39 changes: 19 additions & 20 deletions src/uniquekmers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,23 @@

using namespace std;

UniqueKmers::UniqueKmers(size_t variant_position)
UniqueKmers::UniqueKmers(size_t variant_position, vector<unsigned char>& alleles)
:variant_pos(variant_position),
current_index(0),
local_coverage(0)
{}
local_coverage(0),
path_to_allele(alleles.size())
{
for (size_t i = 0; i < alleles.size(); ++i) {
unsigned char a = alleles[i];
this->path_to_allele[i] = a;
this->alleles[a] = make_pair(KmerPath(), false);
}
}

size_t UniqueKmers::get_variant_position() {
return this->variant_pos;
}

void UniqueKmers::insert_empty_allele(unsigned char allele_id, bool is_undefined) {
this->alleles[allele_id] = make_pair(KmerPath(), is_undefined);
}

void UniqueKmers::insert_path(unsigned short path_id, unsigned char allele_id) {
this->path_to_allele[path_id] = allele_id;
}

void UniqueKmers::insert_kmer(unsigned short readcount, vector<unsigned char>& alleles){
size_t index = this->current_index;
this->kmer_to_count.push_back(readcount);
Expand All @@ -33,9 +32,10 @@ void UniqueKmers::insert_kmer(unsigned short readcount, vector<unsigned char>&

bool UniqueKmers::kmer_on_path(size_t kmer_index, size_t path_index) const {
// check if path_id exists
if (this->path_to_allele.find(path_index) == this->path_to_allele.end()) {
if (path_index >= this->path_to_allele.size()) {
throw runtime_error("UniqueKmers::kmer_on_path: path_index " + to_string(path_index) + " does not exist.");
}

// check if kmer_index is valid and look up position
if (kmer_index < this->current_index) {
unsigned char allele_id = this->path_to_allele.at(path_index);
Expand Down Expand Up @@ -66,17 +66,16 @@ void UniqueKmers::get_path_ids(vector<unsigned short>& p, vector<unsigned char>&
// only return paths that are also contained in only_include
for (auto p_it = only_include->begin(); p_it != only_include->end(); ++p_it) {
// check if path is in path_to_allele
auto it = this->path_to_allele.find(*p_it);
if (it != this->path_to_allele.end()) {
if (*p_it < this->path_to_allele.size()) {
p.push_back(*p_it);
a.push_back(it->second);
a.push_back(this->path_to_allele.at(*p_it));
}
}
} else {
// return all paths and corresponding alleles
for (auto it = this->path_to_allele.begin(); it != this->path_to_allele.end(); ++it) {
p.push_back(it->first);
a.push_back(it->second);
for (size_t i = 0; i < this->path_to_allele.size(); ++i) {
p.push_back(i);
a.push_back(this->path_to_allele.at(i));
}
}
}
Expand Down Expand Up @@ -109,8 +108,8 @@ ostream& operator<< (ostream& stream, const UniqueKmers& uk) {
}

stream << "paths:" << endl;
for (auto it = uk.path_to_allele.begin(); it != uk.path_to_allele.end(); ++it) {
stream << it->first << " covers allele " << (unsigned int) it->second << endl;
for (size_t i = 0; i < uk.path_to_allele.size(); ++i) {
stream << i << " covers allele " << (unsigned int) uk.path_to_allele.at(i) << endl;
}
return stream;
}
Expand Down
Loading

0 comments on commit 8674d3a

Please sign in to comment.