Skip to content

Commit

Permalink
introduce msacontainer object, overhaul main alignment routine
Browse files Browse the repository at this point in the history
  • Loading branch information
gamcil committed Sep 13, 2024
1 parent 3219c87 commit d695073
Show file tree
Hide file tree
Showing 10 changed files with 676 additions and 460 deletions.
2 changes: 2 additions & 0 deletions src/commons/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,6 @@ set(commons_source_files
commons/StructureSmithWaterman.h
commons/newick.cpp
commons/newick.h
commons/MSA.cpp
commons/MSA.h
PARENT_SCOPE)
126 changes: 126 additions & 0 deletions src/commons/MSA.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
#include "MSA.h"

// Creates an empty SubMSA with no members.
// id is set to 0 so it is never read while indeterminate.
SubMSA::SubMSA() : id(0) {}

// Creates a SubMSA whose only member is database id `a`; `a` also becomes its id.
SubMSA::SubMSA(size_t a) : id(a), members({ a }) {}

// Creates a SubMSA whose members are database ids `a` and `b`, in that order.
// NOTE(review): id used to be left uninitialized by this constructor. It now
// takes the first member, mirroring the single-argument form - confirm that
// `a` is the intended representative.
SubMSA::SubMSA(size_t a, size_t b) : id(a), members({ a, b }) {}

// Appends one database id to the end of the member list.
void SubMSA::pushMember(size_t other) {
    members.push_back(other);
}
// Overwrites this SubMSA's members, both profiles and the mask with those of
// `other`. The id field is deliberately left untouched, as in the original.
//
// Plain copy-assignment replaces the former clear() + iterator-range assign():
// it yields the same result for distinct objects and stays well defined (a
// no-op) when `other` is this very object, whereas the old sequence wiped the
// member list in that case.
void SubMSA::update(const SubMSA &other) {
    members = other.members;
    profile_aa = other.profile_aa;
    profile_ss = other.profile_ss;
    mask = other.mask;
}
// Appends every member of `other` to this SubMSA's member list, keeping order.
// Profiles and mask are not touched.
void SubMSA::concat(const SubMSA &other) {
    const std::vector<size_t> &incoming = other.members;
    members.insert(members.end(), incoming.begin(), incoming.end());
}
// Appends a list of database ids to this SubMSA's member list, keeping order.
void SubMSA::concat(const std::vector<size_t> &other) {
    members.insert(members.cend(), other.cbegin(), other.cend());
}

// Builds a container with no SubMSAs and no per-structure storage.
MSAContainer::MSAContainer() = default;

// Builds a container with room for `n` database entries.
// Every slot of dbIdToSubMSAVec starts at `n`, the value that isProfile() and
// mergeInto() compare against (via cigars_aa.size()) to mean "not yet in any SubMSA".
MSAContainer::MSAContainer(size_t n)
    : dbKeys(n),
      dbIdToSubMSAVec(n, n),
      cigars_aa(n),
      cigars_ss(n) {}

// Mutable iterator to the first stored SubMSA.
std::vector<SubMSA>::iterator MSAContainer::begin() {
    return std::begin(data);
}
// Mutable past-the-end iterator over the stored SubMSAs.
std::vector<SubMSA>::iterator MSAContainer::end() {
    return std::end(data);
}
// Read-only iterator to the first stored SubMSA.
std::vector<SubMSA>::const_iterator MSAContainer::begin() const {
    return data.cbegin();
}
// Read-only past-the-end iterator over the stored SubMSAs.
std::vector<SubMSA>::const_iterator MSAContainer::end() const {
    return data.cend();
}

// Returns the SubMSA stored at position `index`. No bounds check is performed.
SubMSA& MSAContainer::operator[](size_t index) {
    SubMSA &entry = data[index];
    return entry;
}

// Returns the SubMSA the given iterator points at (a plain dereference).
SubMSA& MSAContainer::operator[](const std::vector<SubMSA>::iterator& it) {
    SubMSA &target = *it;
    return target;
}

// Returns the most recently appended SubMSA. The container must not be empty.
SubMSA& MSAContainer::back() {
    return *data.rbegin();
}

// Number of SubMSAs currently stored.
size_t MSAContainer::size() const {
    const size_t count = data.size();
    return count;
}

// Appends a new single-member SubMSA for database id `index` and records the
// slot it occupies in dbIdToSubMSAVec.
void MSAContainer::add(size_t index) {
    const size_t slot = data.size();  // position the new element will take
    data.emplace_back(index);
    dbIdToSubMSAVec[index] = slot;
}

// Appends a copy of `msa` and points every one of its members at the new slot
// in dbIdToSubMSAVec.
void MSAContainer::add(const SubMSA &msa) {
    data.push_back(msa);
    const size_t slot = data.size() - 1;
    for (size_t member : msa.members) {
        dbIdToSubMSAVec[member] = slot;
    }
}

// Erases the SubMSAs at the given positions of the container, then rebuilds the
// member -> slot mapping for everything that remains.
//
// toRemove: positions into the container; sorted in place (descending) as a
//           side effect. Repeated positions are erased only once.
//
// Members of the erased SubMSAs keep whatever value dbIdToSubMSAVec held for
// them; only members of surviving SubMSAs are re-indexed.
void MSAContainer::remove(std::vector<size_t> &toRemove) {
    // Highest position first, so that erasing one element never shifts an
    // element that is still waiting to be erased. Sorting through reverse
    // iterators compares the size_t values directly instead of narrowing
    // them to int, as the former std::greater<int> comparator did.
    std::sort(toRemove.rbegin(), toRemove.rend());

    bool havePrevious = false;
    size_t previous = 0;
    for (size_t index : toRemove) {
        // A repeated position would otherwise erase an unrelated neighbour.
        if (havePrevious && index == previous) {
            continue;
        }
        data.erase(data.begin() + index);
        previous = index;
        havePrevious = true;
    }

    // Positions after an erased element have shifted; refresh the lookup table.
    for (size_t i = 0; i < data.size(); ++i) {
        for (size_t member : data[i].members) {
            dbIdToSubMSAVec[member] = i;
        }
    }
}

void MSAContainer::addStructure(size_t id, unsigned int key, size_t length, const char* aa, const char* ss) {
for (size_t j = 0; j < length; j++) {
cigars_aa[id].emplace_back(aa[j]);
cigars_ss[id].emplace_back(ss[j]);
}
dbKeys[id] = key;
}


// Merge SubMSA of db id b into SubMSA of db id a
// Returns index of updated SubMSA of db id a
size_t MSAContainer::mergeInto(size_t a, size_t b) {
size_t aIdx = dbIdToSubMSAVec[a];
size_t bIdx = dbIdToSubMSAVec[b];
if (bIdx == cigars_aa.size()) {
// b isn't a profile
data[aIdx].pushMember(b);
dbIdToSubMSAVec[b] = aIdx;
} else {
data[aIdx].concat(data[bIdx]);
for (size_t i = 0; i < data[bIdx].members.size(); i++) {
size_t member = data[bIdx].members[i];
dbIdToSubMSAVec[member] = aIdx;
}
}
return aIdx;
}

// Update the container with newly created SubMSAs, remove stale ones
// 1: new submsa --> add to msa
// 2: one profile, one structure -> add to profile
// 3: both profile -> merge into query
// cases 2 and 3 always identical since profile always made query
void MSAContainer::update(const std::vector<SubMSA> &newMSAs, std::vector<size_t> &toRemove) {
for (const SubMSA &sub : newMSAs) {
add(sub);
}
if (toRemove.size() > 0) {
remove(toRemove);
}
}


// True when database id `index` has been assigned to a SubMSA, i.e. its lookup
// entry no longer holds the cigars_aa.size() sentinel.
bool MSAContainer::isProfile(size_t index) {
    const size_t unassigned = cigars_aa.size();
    return dbIdToSubMSAVec[index] != unassigned;
}
96 changes: 96 additions & 0 deletions src/commons/MSA.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#ifndef MSA_H
#define MSA_H

#include <vector>
#include <string>
#include <map>
#include <numeric>
#include <cstdint>
#include <algorithm>

// Bit field version of a single alignment instruction, packed into one byte.
//   state (1 bit) : 0 = match (sequence character), 1 = gap
//   count (7 bits): ASCII code of the character for a match, or the gap count
//                   for a gap; the largest storable value is 127
union Instruction {
    struct BitFields {
        std::uint8_t state : 1; // 0 = match, 1 = gap
        std::uint8_t count : 7; // count <= 127
    } bits;
    unsigned char data;

    // Zeroed instruction.
    Instruction() : data(0) {}

    // Explicit state/count pair; the byte is zeroed before the fields are set.
    Instruction(int state, int count) : data(0) {
        bits.state = static_cast<std::uint8_t>(state);
        bits.count = static_cast<std::uint8_t>(count);
    }

    // Match instruction carrying character `c`.
    Instruction(char c) : Instruction(0, static_cast<std::uint8_t>(c)) {}

    // Gap instruction carrying `count`.
    Instruction(int count) : Instruction(1, count) {}

    // Stored character for a match, '-' for a gap.
    char getCharacter() const {
        if (!isSeq()) {
            return '-';
        }
        return static_cast<char>(bits.count);
    }

    // True for a match (sequence) instruction.
    bool isSeq() const {
        return bits.state == 0;
    }

    // True when the 7-bit count has reached its maximum of 127.
    bool isFull() const {
        return bits.count == 127;
    }
};

// A group of database structures that have been aligned together, plus the
// profile data describing the group.
struct SubMSA {
    // Database ID of 'merged' representative. Defaults to 0 so constructors
    // that do not set it explicitly never leave it indeterminate.
    size_t id = 0;
    std::vector<size_t> members;  // Database IDs of member structures
    std::string profile_aa;       // Amino acid profile
    std::string profile_ss;       // 3Di profile
    std::string mask;             // Profile mask string

    SubMSA();                     // empty SubMSA
    SubMSA(size_t a);             // single member a
    SubMSA(size_t a, size_t b);   // two members, a then b

    // Appends one database id to members.
    void pushMember(size_t other);
    // Copies members, both profiles and mask from other (id is not copied).
    void update(const SubMSA &other);
    // Appends other's members / the given ids to members.
    void concat(const SubMSA &other);
    void concat(const std::vector<size_t> &other);
};


// Owns the list of SubMSAs built during alignment together with the
// per-structure lookup tables that are indexed by database id.
class MSAContainer {
    // SubMSAs in insertion order (class members are private by default).
    std::vector<SubMSA> data;

public:
    // Per-database-id key, filled by addStructure().
    std::vector<size_t> dbKeys;
    // Per-database-id position of the owning SubMSA inside the container; the
    // sized constructor fills it with n, which isProfile() treats as "none".
    std::vector<size_t> dbIdToSubMSAVec;
    // Per-database-id instruction lists for the aa and ss strings.
    std::vector<std::vector<Instruction>> cigars_aa;
    std::vector<std::vector<Instruction>> cigars_ss;

    MSAContainer();
    MSAContainer(size_t n);

    // Iteration over the stored SubMSAs.
    std::vector<SubMSA>::iterator begin();
    std::vector<SubMSA>::iterator end();
    std::vector<SubMSA>::const_iterator begin() const;
    std::vector<SubMSA>::const_iterator end() const;

    // Element access by position or by iterator.
    SubMSA& operator[](size_t index);
    SubMSA& operator[](const std::vector<SubMSA>::iterator& it);
    SubMSA& back();
    size_t size() const;

    // Append a single-member SubMSA for a database id, or a copy of msa.
    void add(size_t index);
    void add(const SubMSA &msa);
    // Erase the SubMSAs at the given positions and re-index the remainder.
    void remove(std::vector<size_t> &toRemove);
    // Store key and the aa/ss characters of one structure under id.
    void addStructure(size_t id, unsigned int key, size_t length, const char* aa, const char* ss);
    // Merge the SubMSA of db id b into that of db id a; returns a's position.
    size_t mergeInto(size_t a, size_t b);
    // Add all newMSAs, then remove the positions listed in toRemove.
    void update(const std::vector<SubMSA> &newMSAs, std::vector<size_t> &toRemove);
    // True when db id index has been assigned to a SubMSA.
    bool isProfile(size_t index);
};

#endif
3 changes: 2 additions & 1 deletion src/commons/StructureSmithWaterman.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -877,6 +877,7 @@ Matcher::result_t StructureSmithWaterman::simpleGotoh(
// Adjust CIGAR string to start/end on M
// q/dbStart and q/dbEnd are already correct, no need to adjust here
// q/dbStart set to last M j/i, q/dbEnd last M .ref/.read
size_t alnLength = cigar.length();
trimCIGAR(cigar, qEnd, dbEnd);

delete[] workspace;
Expand All @@ -889,7 +890,7 @@ Matcher::result_t StructureSmithWaterman::simpleGotoh(
0, // align.tCov,
0, // seqId
0, // align.evalue,
0, // alnLength
alnLength, // alnLength
qStart,
qEnd,
query_end,
Expand Down
Loading

0 comments on commit d695073

Please sign in to comment.