Skip to content

Commit d695073

Browse files
committed
introduce msacontainer object, overhaul main alignment routine
1 parent 3219c87 commit d695073

File tree

10 files changed

+676
-460
lines changed

10 files changed

+676
-460
lines changed

src/commons/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,6 @@ set(commons_source_files
55
commons/StructureSmithWaterman.h
66
commons/newick.cpp
77
commons/newick.h
8+
commons/MSA.cpp
9+
commons/MSA.h
810
PARENT_SCOPE)

src/commons/MSA.cpp

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
#include "MSA.h"
2+
3+
SubMSA::SubMSA() {}
4+
SubMSA::SubMSA(size_t a) : id(a), members({ a }) {}
5+
SubMSA::SubMSA(size_t a, size_t b) : members({ a, b }) {}
6+
7+
void SubMSA::pushMember(size_t other) {
8+
members.insert(members.end(), other);
9+
}
10+
void SubMSA::update(const SubMSA &other) {
11+
members.clear();
12+
members.assign(other.members.begin(), other.members.end());
13+
profile_aa = other.profile_aa;
14+
profile_ss = other.profile_ss;
15+
mask = other.mask;
16+
}
17+
void SubMSA::concat(const SubMSA &other) {
18+
members.insert(members.end(), other.members.begin(), other.members.end());
19+
}
20+
void SubMSA::concat(const std::vector<size_t> &other) {
21+
members.insert(members.end(), other.begin(), other.end());
22+
}
23+
24+
MSAContainer::MSAContainer() {}
25+
MSAContainer::MSAContainer(size_t n) : dbKeys(n), dbIdToSubMSAVec(n, n), cigars_aa(n), cigars_ss(n) {}
26+
27+
std::vector<SubMSA>::iterator MSAContainer::begin() {
28+
return data.begin();
29+
}
30+
std::vector<SubMSA>::iterator MSAContainer::end() {
31+
return data.end();
32+
}
33+
std::vector<SubMSA>::const_iterator MSAContainer::begin() const {
34+
return data.begin();
35+
}
36+
std::vector<SubMSA>::const_iterator MSAContainer::end() const {
37+
return data.end();
38+
}
39+
40+
SubMSA& MSAContainer::operator[](size_t index) {
41+
return data[index];
42+
}
43+
44+
SubMSA& MSAContainer::operator[](const std::vector<SubMSA>::iterator& it) {
45+
return *it;
46+
}
47+
48+
SubMSA& MSAContainer::back() {
49+
return data.back();
50+
}
51+
52+
size_t MSAContainer::size() const {
53+
return data.size();
54+
}
55+
56+
void MSAContainer::add(size_t index) {
57+
data.emplace_back(index);
58+
dbIdToSubMSAVec[index] = data.size() - 1;
59+
}
60+
61+
void MSAContainer::add(const SubMSA &msa) {
62+
data.push_back(msa);
63+
for (size_t i = 0; i < msa.members.size(); i++) {
64+
dbIdToSubMSAVec[msa.members[i]] = data.size() - 1;
65+
}
66+
}
67+
68+
void MSAContainer::remove(std::vector<size_t> &toRemove) {
69+
std::sort(toRemove.begin(), toRemove.end(), std::greater<int>());
70+
for (size_t index : toRemove) {
71+
data.erase(data.begin() + index);
72+
}
73+
for (size_t i = 0; i < data.size(); i++) {
74+
const SubMSA &msa = data[i];
75+
for (size_t member : msa.members) {
76+
dbIdToSubMSAVec[member] = i;
77+
}
78+
}
79+
}
80+
81+
void MSAContainer::addStructure(size_t id, unsigned int key, size_t length, const char* aa, const char* ss) {
82+
for (size_t j = 0; j < length; j++) {
83+
cigars_aa[id].emplace_back(aa[j]);
84+
cigars_ss[id].emplace_back(ss[j]);
85+
}
86+
dbKeys[id] = key;
87+
}
88+
89+
90+
// Merge SubMSA of db id b into SubMSA of db id a
91+
// Returns index of updated SubMSA of db id a
92+
size_t MSAContainer::mergeInto(size_t a, size_t b) {
93+
size_t aIdx = dbIdToSubMSAVec[a];
94+
size_t bIdx = dbIdToSubMSAVec[b];
95+
if (bIdx == cigars_aa.size()) {
96+
// b isn't a profile
97+
data[aIdx].pushMember(b);
98+
dbIdToSubMSAVec[b] = aIdx;
99+
} else {
100+
data[aIdx].concat(data[bIdx]);
101+
for (size_t i = 0; i < data[bIdx].members.size(); i++) {
102+
size_t member = data[bIdx].members[i];
103+
dbIdToSubMSAVec[member] = aIdx;
104+
}
105+
}
106+
return aIdx;
107+
}
108+
109+
// Update the container with newly created SubMSAs, remove stale ones
110+
// 1: new submsa --> add to msa
111+
// 2: one profile, one structure -> add to profile
112+
// 3: both profile -> merge into query
113+
// cases 2 and 3 always identical since profile always made query
114+
void MSAContainer::update(const std::vector<SubMSA> &newMSAs, std::vector<size_t> &toRemove) {
115+
for (const SubMSA &sub : newMSAs) {
116+
add(sub);
117+
}
118+
if (toRemove.size() > 0) {
119+
remove(toRemove);
120+
}
121+
}
122+
123+
124+
bool MSAContainer::isProfile(size_t index) {
125+
return (dbIdToSubMSAVec[index] != cigars_aa.size());
126+
}

src/commons/MSA.h

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
#ifndef MSA_H
2+
#define MSA_H
3+
4+
#include <vector>
5+
#include <string>
6+
#include <map>
7+
#include <numeric>
8+
#include <cstdint>
9+
#include <algorithm>
10+
11+
// Bit field version
12+
// First bit = match or gap
13+
// Remaining bits = ASCII character or (gap) count
14+
union Instruction {
15+
struct BitFields {
16+
std::uint8_t state : 1; // 0 = match, 1 = gap
17+
std::uint8_t count : 7; // count < 127
18+
} bits;
19+
unsigned char data;
20+
Instruction() {
21+
data = 0;
22+
}
23+
Instruction(int state, int count) {
24+
data = 0;
25+
bits.state = static_cast<std::uint8_t>(state);
26+
bits.count = static_cast<std::uint8_t>(count);
27+
}
28+
Instruction(char c) {
29+
data = 0;
30+
bits.state = static_cast<std::uint8_t>(0);
31+
bits.count = static_cast<std::uint8_t>(c);
32+
}
33+
Instruction(int count) {
34+
data = 0;
35+
bits.state = static_cast<std::uint8_t>(1);
36+
bits.count = static_cast<std::uint8_t>(count);
37+
}
38+
char getCharacter() const {
39+
return (bits.state == 0) ? static_cast<char>(bits.count) : '-';
40+
}
41+
bool isSeq() const {
42+
return (bits.state == 0);
43+
}
44+
bool isFull() const {
45+
return (bits.count == 127);
46+
}
47+
};
48+
49+
struct SubMSA {
50+
size_t id; // Database ID of 'merged' representative
51+
std::vector<size_t> members; // Database IDs of member structures
52+
std::string profile_aa; // Amino acid profile
53+
std::string profile_ss; // 3Di profile
54+
std::string mask; // Profile mask string
55+
SubMSA();
56+
SubMSA(size_t a);
57+
SubMSA(size_t a, size_t b);
58+
void pushMember(size_t other);
59+
void update(const SubMSA &other);
60+
void concat(const SubMSA &other);
61+
void concat(const std::vector<size_t> &other);
62+
};
63+
64+
65+
class MSAContainer {
66+
private:
67+
std::vector<SubMSA> data;
68+
69+
public:
70+
std::vector<size_t> dbKeys;
71+
std::vector<size_t> dbIdToSubMSAVec;
72+
std::vector<std::vector<Instruction> > cigars_aa;
73+
std::vector<std::vector<Instruction> > cigars_ss;
74+
75+
MSAContainer();
76+
MSAContainer(size_t n);
77+
78+
std::vector<SubMSA>::iterator begin();
79+
std::vector<SubMSA>::iterator end();
80+
std::vector<SubMSA>::const_iterator begin() const;
81+
std::vector<SubMSA>::const_iterator end() const;
82+
83+
SubMSA& operator[](size_t index);
84+
SubMSA& operator[](const std::vector<SubMSA>::iterator& it);
85+
SubMSA& back();
86+
size_t size() const;
87+
void add(size_t index);
88+
void add(const SubMSA &msa);
89+
void remove(std::vector<size_t> &toRemove);
90+
void addStructure(size_t id, unsigned int key, size_t length, const char* aa, const char* ss);
91+
size_t mergeInto(size_t a, size_t b);
92+
void update(const std::vector<SubMSA> &newMSAs, std::vector<size_t> &toRemove);
93+
bool isProfile(size_t index);
94+
};
95+
96+
#endif

src/commons/StructureSmithWaterman.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -877,6 +877,7 @@ Matcher::result_t StructureSmithWaterman::simpleGotoh(
877877
// Adjust CIGAR string to start/end on M
878878
// q/dbStart and q/dbEnd are already correct, no need to adjust here
879879
// q/dbStart set to last M j/i, q/dbEnd last M .ref/.read
880+
size_t alnLength = cigar.length();
880881
trimCIGAR(cigar, qEnd, dbEnd);
881882

882883
delete[] workspace;
@@ -889,7 +890,7 @@ Matcher::result_t StructureSmithWaterman::simpleGotoh(
889890
0, // align.tCov,
890891
0, // seqId
891892
0, // align.evalue,
892-
0, // alnLength
893+
alnLength, // alnLength
893894
qStart,
894895
qEnd,
895896
query_end,

0 commit comments

Comments
 (0)