-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
introduce msacontainer object, overhaul main alignment routine
- Loading branch information
Showing
10 changed files
with
676 additions
and
460 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
#include "MSA.h" | ||
|
||
// Creates an empty SubMSA with no members.
// `id` is set to 0 so that it is never read while uninitialised.
SubMSA::SubMSA() : id(0) {}

// Creates a SubMSA whose representative id and sole member are both `a`.
SubMSA::SubMSA(size_t a) : id(a), members({ a }) {}

// Creates a SubMSA with the two members `a` and `b`, in that order.
// `id` used to be left uninitialised by this constructor; it is now set to `a`.
// NOTE(review): picking `a` as the representative is an assumption -- confirm
// against the call sites that build two-member SubMSAs.
SubMSA::SubMSA(size_t a, size_t b) : id(a), members({ a, b }) {}
|
||
// Appends one database id to the end of this SubMSA's member list.
void SubMSA::pushMember(size_t other) {
    members.push_back(other);
}
// Overwrites this SubMSA's members, both profiles and the mask with copies of
// the corresponding fields of `other`. The representative `id` is not copied.
void SubMSA::update(const SubMSA &other) {
    // Plain copy-assignment is safe when `other` is `*this`. The previous
    // clear()-then-assign sequence emptied the member list in that case,
    // because it cleared the very range it was about to copy from.
    members = other.members;
    profile_aa = other.profile_aa;
    profile_ss = other.profile_ss;
    mask = other.mask;
}
// Appends every member of `other` to the end of this SubMSA's member list.
// Profiles, mask and id are left untouched.
void SubMSA::concat(const SubMSA &other) {
    // Forward to the vector overload, which performs the same range insert.
    concat(other.members);
}
// Appends every id in `other` to the end of this SubMSA's member list,
// preserving their order.
void SubMSA::concat(const std::vector<size_t> &other) {
    const auto first = other.begin();
    const auto last = other.end();
    members.insert(members.end(), first, last);
}
|
||
// Creates an empty container; every lookup table starts out empty.
MSAContainer::MSAContainer() = default;

// Creates a container with room for `n` database entries.
// Every slot of dbIdToSubMSAVec is filled with the value `n`, which the rest
// of the class compares against cigars_aa.size() to detect an id that is not
// yet part of any SubMSA.
MSAContainer::MSAContainer(size_t n)
    : dbKeys(n),
      dbIdToSubMSAVec(n, n),
      cigars_aa(n),
      cigars_ss(n) {}
|
||
// Mutable iterator to the first stored SubMSA.
std::vector<SubMSA>::iterator MSAContainer::begin() {
    return std::begin(data);
}
// Mutable past-the-end iterator over the stored SubMSAs.
std::vector<SubMSA>::iterator MSAContainer::end() {
    return std::end(data);
}
// Read-only iterator to the first stored SubMSA.
std::vector<SubMSA>::const_iterator MSAContainer::begin() const {
    return data.cbegin();
}
// Read-only past-the-end iterator over the stored SubMSAs.
std::vector<SubMSA>::const_iterator MSAContainer::end() const {
    return data.cend();
}
|
||
// Returns the SubMSA stored at position `index` of the container.
// No bounds check is performed.
SubMSA& MSAContainer::operator[](size_t index) {
    SubMSA &entry = data[index];
    return entry;
}
|
||
// Returns the SubMSA the given iterator points at.
// The iterator is dereferenced as-is; it is not validated.
SubMSA& MSAContainer::operator[](const std::vector<SubMSA>::iterator& it) {
    SubMSA &entry = *it;
    return entry;
}
|
||
// Returns the most recently appended SubMSA.
// The container must not be empty.
SubMSA& MSAContainer::back() {
    return *data.rbegin();
}
|
||
size_t MSAContainer::size() const { | ||
return data.size(); | ||
} | ||
|
||
// Appends a new single-member SubMSA for database id `index` and records the
// position it was stored at in dbIdToSubMSAVec.
void MSAContainer::add(size_t index) {
    // The new element lands at the current end of the vector.
    const size_t slot = data.size();
    data.emplace_back(index);
    dbIdToSubMSAVec[index] = slot;
}
|
||
void MSAContainer::add(const SubMSA &msa) { | ||
data.push_back(msa); | ||
for (size_t i = 0; i < msa.members.size(); i++) { | ||
dbIdToSubMSAVec[msa.members[i]] = data.size() - 1; | ||
} | ||
} | ||
|
||
void MSAContainer::remove(std::vector<size_t> &toRemove) { | ||
std::sort(toRemove.begin(), toRemove.end(), std::greater<int>()); | ||
for (size_t index : toRemove) { | ||
data.erase(data.begin() + index); | ||
} | ||
for (size_t i = 0; i < data.size(); i++) { | ||
const SubMSA &msa = data[i]; | ||
for (size_t member : msa.members) { | ||
dbIdToSubMSAVec[member] = i; | ||
} | ||
} | ||
} | ||
|
||
void MSAContainer::addStructure(size_t id, unsigned int key, size_t length, const char* aa, const char* ss) { | ||
for (size_t j = 0; j < length; j++) { | ||
cigars_aa[id].emplace_back(aa[j]); | ||
cigars_ss[id].emplace_back(ss[j]); | ||
} | ||
dbKeys[id] = key; | ||
} | ||
|
||
|
||
// Merge SubMSA of db id b into SubMSA of db id a | ||
// Returns index of updated SubMSA of db id a | ||
size_t MSAContainer::mergeInto(size_t a, size_t b) { | ||
size_t aIdx = dbIdToSubMSAVec[a]; | ||
size_t bIdx = dbIdToSubMSAVec[b]; | ||
if (bIdx == cigars_aa.size()) { | ||
// b isn't a profile | ||
data[aIdx].pushMember(b); | ||
dbIdToSubMSAVec[b] = aIdx; | ||
} else { | ||
data[aIdx].concat(data[bIdx]); | ||
for (size_t i = 0; i < data[bIdx].members.size(); i++) { | ||
size_t member = data[bIdx].members[i]; | ||
dbIdToSubMSAVec[member] = aIdx; | ||
} | ||
} | ||
return aIdx; | ||
} | ||
|
||
// Update the container with newly created SubMSAs, remove stale ones | ||
// 1: new submsa --> add to msa | ||
// 2: one profile, one structure -> add to profile | ||
// 3: both profile -> merge into query | ||
// cases 2 and 3 always identical since profile always made query | ||
void MSAContainer::update(const std::vector<SubMSA> &newMSAs, std::vector<size_t> &toRemove) { | ||
for (const SubMSA &sub : newMSAs) { | ||
add(sub); | ||
} | ||
if (toRemove.size() > 0) { | ||
remove(toRemove); | ||
} | ||
} | ||
|
||
|
||
bool MSAContainer::isProfile(size_t index) { | ||
return (dbIdToSubMSAVec[index] != cigars_aa.size()); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
#ifndef MSA_H | ||
#define MSA_H | ||
|
||
#include <vector> | ||
#include <string> | ||
#include <map> | ||
#include <numeric> | ||
#include <cstdint> | ||
#include <algorithm> | ||
|
||
// One-byte alignment instruction, stored as a bit field.
//   state (1 bit) : 0 = match (sequence character), 1 = gap
//   count (7 bits): the ASCII character for a match, or the gap length for a
//                   gap; the field holds values 0..127
union Instruction {
    struct BitFields {
        std::uint8_t state : 1;  // 0 = match, 1 = gap
        std::uint8_t count : 7;  // character code or gap count, 0..127
    } bits;
    unsigned char data;  // the same byte viewed as a whole

    // Zeroed instruction: a match carrying character code 0.
    Instruction() : data(0) {}

    // Explicit state/payload pair; both values are narrowed to their field width.
    Instruction(int state, int count) : data(0) {
        bits.state = static_cast<std::uint8_t>(state);
        bits.count = static_cast<std::uint8_t>(count);
    }

    // Match instruction carrying character `c`.
    Instruction(char c) : Instruction(0, static_cast<int>(c)) {}

    // Gap instruction covering `count` positions.
    Instruction(int count) : Instruction(1, count) {}

    // The stored character for a match, '-' for a gap.
    char getCharacter() const {
        if (bits.state != 0) {
            return '-';
        }
        return static_cast<char>(bits.count);
    }

    // True when this instruction is a match (carries a sequence character).
    bool isSeq() const {
        return !bits.state;
    }

    // True when the 7-bit payload is at its maximum value, 127.
    bool isFull() const {
        return bits.count == 127;
    }
};
|
||
// A group of structures that have been aligned together, plus the profile
// data built for the group.
struct SubMSA {
    // Database ID of the 'merged' representative.
    size_t id;
    // Database IDs of the member structures.
    std::vector<size_t> members;
    // Amino acid profile.
    std::string profile_aa;
    // 3Di profile.
    std::string profile_ss;
    // Profile mask string.
    std::string mask;

    // Empty SubMSA.
    SubMSA();
    // SubMSA with `a` as its id and only member.
    SubMSA(size_t a);
    // SubMSA with the two members `a` and `b`.
    SubMSA(size_t a, size_t b);

    // Appends a single member id.
    void pushMember(size_t other);
    // Replaces members, profiles and mask with those of `other`.
    void update(const SubMSA &other);
    // Appends all members of another SubMSA.
    void concat(const SubMSA &other);
    // Appends all ids of a plain id list.
    void concat(const std::vector<size_t> &other);
};
|
||
|
||
class MSAContainer { | ||
private: | ||
std::vector<SubMSA> data; | ||
|
||
public: | ||
std::vector<size_t> dbKeys; | ||
std::vector<size_t> dbIdToSubMSAVec; | ||
std::vector<std::vector<Instruction> > cigars_aa; | ||
std::vector<std::vector<Instruction> > cigars_ss; | ||
|
||
MSAContainer(); | ||
MSAContainer(size_t n); | ||
|
||
std::vector<SubMSA>::iterator begin(); | ||
std::vector<SubMSA>::iterator end(); | ||
std::vector<SubMSA>::const_iterator begin() const; | ||
std::vector<SubMSA>::const_iterator end() const; | ||
|
||
SubMSA& operator[](size_t index); | ||
SubMSA& operator[](const std::vector<SubMSA>::iterator& it); | ||
SubMSA& back(); | ||
size_t size() const; | ||
void add(size_t index); | ||
void add(const SubMSA &msa); | ||
void remove(std::vector<size_t> &toRemove); | ||
void addStructure(size_t id, unsigned int key, size_t length, const char* aa, const char* ss); | ||
size_t mergeInto(size_t a, size_t b); | ||
void update(const std::vector<SubMSA> &newMSAs, std::vector<size_t> &toRemove); | ||
bool isProfile(size_t index); | ||
}; | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.