Skip to content

Commit 4766f92

Browse files
Fix masking issue in index creation when the k-mer index is excluded
1 parent 934db4f commit 4766f92

File tree

4 files changed

+83
-93
lines changed

4 files changed

+83
-93
lines changed

Diff for: src/prefiltering/IndexBuilder.cpp

+39-49
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,7 @@ class DbInfo {
5252
};
5353

5454

55-
void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedLookup,
56-
SequenceLookup **unmaskedLookup,BaseMatrix &subMat,
55+
void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup ** externalLookup, BaseMatrix &subMat,
5756
ScoreMatrix & three, ScoreMatrix & two, Sequence *seq,
5857
DBReader<unsigned int> *dbr, size_t dbFrom, size_t dbTo, int kmerThr,
5958
bool mask, bool maskLowerCaseMode, float maskProb, int maskNrepeats, int targetSearchMode) {
@@ -65,27 +64,14 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
6564
size_t dbSize = dbTo - dbFrom;
6665
DbInfo* info = new DbInfo(dbFrom, dbTo, seq->getEffectiveKmerSize(), *dbr);
6766

68-
SequenceLookup *sequenceLookup;
69-
if (unmaskedLookup != NULL && maskedLookup == NULL) {
70-
*unmaskedLookup = new SequenceLookup(dbSize, info->aaDbSize);
71-
sequenceLookup = *unmaskedLookup;
72-
} else if (unmaskedLookup == NULL && maskedLookup != NULL) {
73-
*maskedLookup = new SequenceLookup(dbSize, info->aaDbSize);
74-
sequenceLookup = *maskedLookup;
75-
} else if (unmaskedLookup != NULL && maskedLookup != NULL) {
76-
*unmaskedLookup = new SequenceLookup(dbSize, info->aaDbSize);
77-
*maskedLookup = new SequenceLookup(dbSize, info->aaDbSize);
78-
sequenceLookup = *maskedLookup;
79-
} else{
80-
Debug(Debug::ERROR) << "This should not happen\n";
81-
EXIT(EXIT_FAILURE);
82-
}
67+
*externalLookup = new SequenceLookup(dbSize, info->aaDbSize);
68+
SequenceLookup *sequenceLookup = *externalLookup;
8369

8470

8571
// identical scores for memory reduction code
8672
char *idScoreLookup = getScoreLookup(subMat);
8773
Debug::Progress progress(dbTo-dbFrom);
88-
74+
bool needMasking = (mask == 1 || maskNrepeats > 0 || maskLowerCaseMode == 1);
8975
size_t maskedResidues = 0;
9076
size_t totalKmerCount = 0;
9177
#pragma omp parallel
@@ -96,16 +82,17 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
9682
#endif
9783
// need to prune low scoring k-mers through masking
9884
Masker *masker = NULL;
99-
if (maskedLookup != NULL) {
85+
if (needMasking) {
10086
masker = new Masker(subMat);
10187
}
10288

103-
104-
Indexer idxer(static_cast<unsigned int>(indexTable->getAlphabetSize()), seq->getKmerSize());
89+
unsigned int alphabetSize = (indexTable != NULL) ? static_cast<unsigned int>(indexTable->getAlphabetSize())
90+
: static_cast<unsigned int>(subMat.alphabetSize);
91+
Indexer idxer(alphabetSize, seq->getKmerSize());
10592
Sequence s(seq->getMaxLen(), seq->getSeqType(), &subMat, seq->getKmerSize(), seq->isSpaced(), false, true, seq->getUserSpacedKmerPattern());
10693

10794
KmerGenerator *generator = NULL;
108-
if (isTargetSimiliarKmerSearch) {
95+
if (isTargetSimiliarKmerSearch && indexTable != NULL) {
10996
generator = new KmerGenerator(seq->getKmerSize(), indexTable->getAlphabetSize(), kmerThr);
11097
if(isProfile){
11198
generator->setDivideStrategy(s.profile_matrix);
@@ -132,26 +119,21 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
132119
// count similar or exact k-mers based on sequence type
133120
if (isTargetSimiliarKmerSearch) {
134121
// Find out if we should also mask profiles
135-
totalKmerCount += indexTable->addSimilarKmerCount(&s, generator);
136-
unsigned char * seq = (isProfile) ? s.numConsensusSequence : s.numSequence;
137-
if (unmaskedLookup != NULL) {
138-
(*unmaskedLookup)->addSequence(seq, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]);
139-
} else if (maskedLookup != NULL) {
140-
(*maskedLookup)->addSequence(seq, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]);
122+
if(indexTable != NULL){
123+
totalKmerCount += indexTable->addSimilarKmerCount(&s, generator);
141124
}
125+
unsigned char * seq = (isProfile) ? s.numConsensusSequence : s.numSequence;
126+
127+
sequenceLookup->addSequence(seq, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]);
128+
142129
} else {
143130
// Do not mask if column state sequences are used
144-
if (unmaskedLookup != NULL) {
145-
(*unmaskedLookup)->addSequence(s.numSequence, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]);
146-
}
147-
148131
maskedResidues += masker->maskSequence(s, mask, maskProb, maskLowerCaseMode, maskNrepeats);
132+
sequenceLookup->addSequence(s.numSequence, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]);
149133

150-
if(maskedLookup != NULL){
151-
(*maskedLookup)->addSequence(s.numSequence, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]);
134+
if(indexTable != NULL){
135+
totalKmerCount += indexTable->addKmerCount(&s, &idxer, buffer, kmerThr, idScoreLookup);
152136
}
153-
154-
totalKmerCount += indexTable->addKmerCount(&s, &idxer, buffer, kmerThr, idScoreLookup);
155137
}
156138
}
157139

@@ -168,14 +150,13 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
168150

169151

170152
Debug(Debug::INFO) << "Index table: Masked residues: " << maskedResidues << "\n";
171-
if(totalKmerCount == 0) {
172-
Debug(Debug::ERROR) << "No k-mer could be extracted for the database " << dbr->getDataFileName() << ".\n"
153+
if(indexTable != NULL && totalKmerCount == 0) {
154+
Debug(Debug::WARNING) << "No k-mer could be extracted for the database " << dbr->getDataFileName() << ".\n"
173155
<< "Maybe the sequences length is less than 14 residues.\n";
174156
if (maskedResidues == true){
175-
Debug(Debug::ERROR) << " or contains only low complexity regions.";
176-
Debug(Debug::ERROR) << "Use --mask 0 to deactivate the low complexity filter.\n";
157+
Debug(Debug::WARNING) << " or contains only low complexity regions.";
158+
Debug(Debug::WARNING) << "Use --mask 0 to deactivate the low complexity filter.\n";
177159
}
178-
EXIT(EXIT_FAILURE);
179160
}
180161

181162
dbr->remapData();
@@ -193,9 +174,10 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
193174
// }
194175
// Debug(Debug::INFO) << "Index table: Remove "<< lowSelectiveResidues <<" none selective residues\n";
195176
// Debug(Debug::INFO) << "Index table: init... from "<< dbFrom << " to "<< dbTo << "\n";
196-
197-
indexTable->initMemory(info->tableSize);
198-
indexTable->init();
177+
if(indexTable != NULL){
178+
indexTable->initMemory(info->tableSize);
179+
indexTable->init();
180+
}
199181

200182
delete info;
201183
Debug::Progress progress2(dbTo-dbFrom);
@@ -208,7 +190,9 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
208190
thread_idx = static_cast<unsigned int>(omp_get_thread_num());
209191
#endif
210192
Sequence s(seq->getMaxLen(), seq->getSeqType(), &subMat, seq->getKmerSize(), seq->isSpaced(), false, true, seq->getUserSpacedKmerPattern());
211-
Indexer idxer(static_cast<unsigned int>(indexTable->getAlphabetSize()), seq->getKmerSize());
193+
unsigned int alphabetSize = (indexTable != NULL) ? static_cast<unsigned int>(indexTable->getAlphabetSize())
194+
: static_cast<unsigned int>(subMat.alphabetSize);
195+
Indexer idxer(alphabetSize, seq->getKmerSize());
212196
IndexEntryLocalTmp *buffer = static_cast<IndexEntryLocalTmp *>(malloc( seq->getMaxLen() * sizeof(IndexEntryLocalTmp)));
213197
size_t bufferSize = seq->getMaxLen();
214198
KmerGenerator *generator = NULL;
@@ -229,10 +213,14 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
229213
unsigned int qKey = dbr->getDbKey(id);
230214
if (isTargetSimiliarKmerSearch) {
231215
s.mapSequence(id - dbFrom, qKey, dbr->getData(id, thread_idx), dbr->getSeqLen(id));
232-
indexTable->addSimilarSequence(&s, generator, &buffer, bufferSize, &idxer);
216+
if(indexTable != NULL) {
217+
indexTable->addSimilarSequence(&s, generator, &buffer, bufferSize, &idxer);
218+
}
233219
} else {
234220
s.mapSequence(id - dbFrom, qKey, sequenceLookup->getSequence(id - dbFrom));
235-
indexTable->addSequence(&s, &idxer, &buffer, bufferSize, kmerThr, idScoreLookup);
221+
if(indexTable != NULL) {
222+
indexTable->addSequence(&s, &idxer, &buffer, bufferSize, kmerThr, idScoreLookup);
223+
}
236224
}
237225
}
238226

@@ -245,6 +233,8 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
245233
if(idScoreLookup!=NULL){
246234
delete[] idScoreLookup;
247235
}
248-
indexTable->revertPointer();
249-
indexTable->sortDBSeqLists();
236+
if(indexTable != NULL){
237+
indexTable->revertPointer();
238+
indexTable->sortDBSeqLists();
239+
}
250240
}

Diff for: src/prefiltering/IndexBuilder.h

+1-2
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,7 @@
66

77
class IndexBuilder {
88
public:
9-
static void fillDatabase(IndexTable *indexTable, SequenceLookup **maskedLookup, SequenceLookup **unmaskedLookup,
10-
BaseMatrix &subMat,
9+
static void fillDatabase(IndexTable *indexTable, SequenceLookup **externalLookup, BaseMatrix &subMat,
1110
ScoreMatrix & three, ScoreMatrix & two, Sequence *seq,
1211
DBReader<unsigned int> *dbr, size_t dbFrom, size_t dbTo, int kmerThr,
1312
bool mask, bool maskLowerCaseMode, float maskProb, int maskNrepeats, int targetSearchMode);

Diff for: src/prefiltering/Prefiltering.cpp

+1-3
Original file line numberDiff line numberDiff line change
@@ -531,11 +531,9 @@ void Prefiltering::getIndexTable(int split, size_t dbFrom, size_t dbSize) {
531531
Parameters::isEqualDbtype(targetSeqType,Parameters::DBTYPE_AMINO_ACIDS))
532532
? alphabetSize -1 : alphabetSize;
533533
indexTable = new IndexTable(adjustAlphabetSize, kmerSize, false);
534-
SequenceLookup **unmaskedLookup = maskMode == 0 && maskLowerCaseMode == 0 ? &sequenceLookup : NULL;
535-
SequenceLookup **maskedLookup = maskMode == 1 || maskLowerCaseMode == 1 ? &sequenceLookup : NULL;
536534

537535
Debug(Debug::INFO) << "Index table k-mer threshold: " << localKmerThr << " at k-mer size " << kmerSize << " \n";
538-
IndexBuilder::fillDatabase(indexTable, maskedLookup, unmaskedLookup, *kmerSubMat,
536+
IndexBuilder::fillDatabase(indexTable, &sequenceLookup, *kmerSubMat,
539537
_3merSubMatrix, _2merSubMatrix,
540538
&tseq, tdbr, dbFrom, dbFrom + dbSize,
541539
localKmerThr, maskMode, maskLowerCaseMode,

Diff for: src/prefiltering/PrefilteringIndexReader.cpp

+42-39
Original file line numberDiff line numberDiff line change
@@ -190,14 +190,14 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB,
190190
(Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_NUCLEOTIDES) || Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_AMINO_ACIDS))
191191
? alphabetSize -1: alphabetSize;
192192

193-
const bool noPrefilter = (indexSubset & Parameters::INDEX_SUBSET_NO_PREFILTER) != 0;
194-
if (noPrefilter) {
195-
splits = 0;
193+
const bool noKmerIndex = (indexSubset & Parameters::INDEX_SUBSET_NO_PREFILTER) != 0;
194+
if (noKmerIndex) {
195+
splits = 1;
196196
}
197197

198198
ScoreMatrix s3;
199199
ScoreMatrix s2;
200-
if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) == false && noPrefilter == false) {
200+
if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) == false && noKmerIndex == false) {
201201
int alphabetSize = subMat->alphabetSize;
202202
subMat->alphabetSize = subMat->alphabetSize-1;
203203
s3 = ExtendedSubstitutionMatrix::calcScoreMatrix(*subMat, 3);
@@ -225,36 +225,53 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB,
225225
continue;
226226
}
227227

228-
IndexTable indexTable(adjustAlphabetSize, kmerSize, false);
228+
IndexTable * indexTable;
229+
if(noKmerIndex){
230+
indexTable = NULL;
231+
} else {
232+
indexTable = new IndexTable(adjustAlphabetSize, kmerSize, false);
233+
}
229234
SequenceLookup *sequenceLookup = NULL;
230-
IndexBuilder::fillDatabase(&indexTable,
231-
(maskMode == 1 || maskNrepeats > 0 || maskLowerCase == 1) ? &sequenceLookup : NULL,
232-
(maskMode == 0 && maskNrepeats == 0 && maskLowerCase == 0) ? &sequenceLookup : NULL,
235+
IndexBuilder::fillDatabase(indexTable, &sequenceLookup,
233236
*subMat, s3, s2, &seq, dbr1, dbFrom, dbFrom + dbSize, kmerThr,
234237
maskMode, maskLowerCase, maskProb, maskNrepeats, targetSearchMode);
235-
indexTable.printStatistics(subMat->num2aa);
236238

237239
if (sequenceLookup == NULL) {
238240
Debug(Debug::ERROR) << "Invalid mask mode. No sequence lookup created!\n";
239241
EXIT(EXIT_FAILURE);
240242
}
241-
242-
// save the entries
243243
unsigned int keyOffset = 1000 * s;
244-
Debug(Debug::INFO) << "Write ENTRIES (" << (keyOffset + ENTRIES) << ")\n";
245-
char *entries = (char *) indexTable.getEntries();
246-
size_t entriesSize = indexTable.getTableEntriesNum() * indexTable.getSizeOfEntry();
247-
writer.writeData(entries, entriesSize, (keyOffset + ENTRIES), SPLIT_INDX + s);
248-
writer.alignToPageSize(SPLIT_INDX + s);
249-
250-
// save the size
251-
Debug(Debug::INFO) << "Write ENTRIESOFFSETS (" << (keyOffset + ENTRIESOFFSETS) << ")\n";
252-
char *offsets = (char*)indexTable.getOffsets();
253-
size_t offsetsSize = (indexTable.getTableSize() + 1) * sizeof(size_t);
254-
writer.writeData(offsets, offsetsSize, (keyOffset + ENTRIESOFFSETS), SPLIT_INDX + s);
255-
writer.alignToPageSize(SPLIT_INDX + s);
256-
indexTable.deleteEntries();
257-
244+
if(noKmerIndex == false){
245+
indexTable->printStatistics(subMat->num2aa);
246+
// save the entries
247+
Debug(Debug::INFO) << "Write ENTRIES (" << (keyOffset + ENTRIES) << ")\n";
248+
char *entries = (char *) indexTable->getEntries();
249+
size_t entriesSize = indexTable->getTableEntriesNum() * indexTable->getSizeOfEntry();
250+
writer.writeData(entries, entriesSize, (keyOffset + ENTRIES), SPLIT_INDX + s);
251+
writer.alignToPageSize(SPLIT_INDX + s);
252+
253+
// save the size
254+
Debug(Debug::INFO) << "Write ENTRIESOFFSETS (" << (keyOffset + ENTRIESOFFSETS) << ")\n";
255+
char *offsets = (char *) indexTable->getOffsets();
256+
size_t offsetsSize = (indexTable->getTableSize() + 1) * sizeof(size_t);
257+
writer.writeData(offsets, offsetsSize, (keyOffset + ENTRIESOFFSETS), SPLIT_INDX + s);
258+
writer.alignToPageSize(SPLIT_INDX + s);
259+
indexTable->deleteEntries();
260+
261+
// ENTRIESNUM
262+
Debug(Debug::INFO) << "Write ENTRIESNUM (" << (keyOffset + ENTRIESNUM) << ")\n";
263+
uint64_t entriesNum = indexTable->getTableEntriesNum();
264+
char *entriesNumPtr = (char *) &entriesNum;
265+
writer.writeData(entriesNumPtr, 1 * sizeof(uint64_t), (keyOffset + ENTRIESNUM), SPLIT_INDX + s);
266+
writer.alignToPageSize(SPLIT_INDX + s);
267+
268+
// SEQCOUNT
269+
Debug(Debug::INFO) << "Write SEQCOUNT (" << (keyOffset + SEQCOUNT) << ")\n";
270+
size_t tablesize = indexTable->getSize();
271+
char *tablesizePtr = (char *) &tablesize;
272+
writer.writeData(tablesizePtr, 1 * sizeof(size_t), (keyOffset + SEQCOUNT), SPLIT_INDX + s);
273+
writer.alignToPageSize(SPLIT_INDX + s);
274+
}
258275
Debug(Debug::INFO) << "Write SEQINDEXDATASIZE (" << (keyOffset + SEQINDEXDATASIZE) << ")\n";
259276
int64_t seqindexDataSize = sequenceLookup->getDataSize();
260277
char *seqindexDataSizePtr = (char *) &seqindexDataSize;
@@ -271,20 +288,6 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB,
271288
writer.writeData(sequenceLookup->getData(), (sequenceLookup->getDataSize() + 1) * sizeof(char), (keyOffset + SEQINDEXDATA), SPLIT_INDX + s);
272289
writer.alignToPageSize(SPLIT_INDX + s);
273290
delete sequenceLookup;
274-
275-
// ENTRIESNUM
276-
Debug(Debug::INFO) << "Write ENTRIESNUM (" << (keyOffset + ENTRIESNUM) << ")\n";
277-
uint64_t entriesNum = indexTable.getTableEntriesNum();
278-
char *entriesNumPtr = (char *) &entriesNum;
279-
writer.writeData(entriesNumPtr, 1 * sizeof(uint64_t), (keyOffset + ENTRIESNUM), SPLIT_INDX + s);
280-
writer.alignToPageSize(SPLIT_INDX + s);
281-
282-
// SEQCOUNT
283-
Debug(Debug::INFO) << "Write SEQCOUNT (" << (keyOffset + SEQCOUNT) << ")\n";
284-
size_t tablesize = indexTable.getSize();
285-
char *tablesizePtr = (char *) &tablesize;
286-
writer.writeData(tablesizePtr, 1 * sizeof(size_t), (keyOffset + SEQCOUNT), SPLIT_INDX + s);
287-
writer.alignToPageSize(SPLIT_INDX + s);
288291
}
289292

290293
if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) == false && indexSubset != Parameters::INDEX_SUBSET_NO_PREFILTER) {

0 commit comments

Comments
 (0)