From b0e91c12850cf1cf84093c5a751a94679e1ab831 Mon Sep 17 00:00:00 2001 From: Martin Steinegger Date: Mon, 6 Jan 2025 23:32:03 +0900 Subject: [PATCH] Fix https://github.com/soedinglab/MMseqs2/issues/912 --- src/commons/DBReader.cpp | 36 +++++++++++++++++++++++++++++++--- src/commons/DBReader.h | 3 +++ src/util/convertalignments.cpp | 7 ------- 3 files changed, 36 insertions(+), 10 deletions(-) diff --git a/src/commons/DBReader.cpp b/src/commons/DBReader.cpp index f52d2369c..1a73a3e4c 100644 --- a/src/commons/DBReader.cpp +++ b/src/commons/DBReader.cpp @@ -185,7 +185,9 @@ template bool DBReader::open(int accessType){ } compression = isCompressed(dbtype); - if(compression == COMPRESSED){ + padded = (getExtendedDbtype(dbtype) & Parameters::DBTYPE_EXTENDED_GPU); /* non-zero when the extended dbtype shares bits with Parameters::DBTYPE_EXTENDED_GPU */ + + if(compression == COMPRESSED || padded){ /* padded databases get the same per-thread buffers; getUnpadded() writes its output into compressedBuffers */ compressedBufferSizes = new size_t[threads]; compressedBuffers = new char*[threads]; dstream = new ZSTD_DStream*[threads]; @@ -530,6 +532,29 @@ template size_t DBReader::bsearch(const Index * index, size_t N, return std::upper_bound(index, index + N, val, Index::compareByIdOnly) - index; } + +template char* DBReader::getUnpadded(size_t id, int thrIdx) { /* Translates entry id into letters: each of the first getSeqLen(id) bytes returned by getDataUncompressed(id) is looked up in CODE_TO_CHAR and stored in compressedBuffers[thrIdx], which is returned. The result aliases that per-thread buffer, so the next call with the same thrIdx overwrites it. */ + char *data = getDataUncompressed(id); + size_t seqLen = getSeqLen(id); + + static const char CODE_TO_CHAR[21] = { + 'A', /* 0 */ 'C', /* 1 */ 'D', /* 2 */ + 'E', /* 3 */ 'F', /* 4 */ 'G', /* 5 */ + 'H', /* 6 */ 'I', /* 7 */ 'K', /* 8 */ + 'L', /* 9 */ 'M', /* 10 */ 'N', /* 11 */ + 'P', /* 12 */ 'Q', /* 13 */ 'R', /* 14 */ + 'S', /* 15 */ 'T', /* 16 */ 'V', /* 17 */ + 'W', /* 18 */ 'Y', /* 19 */ 'X' /* 20 */ + }; + + for(size_t i = 0; i < seqLen; i++){ + unsigned char code = static_cast(data[i]); + unsigned char baseCode = (code >= 32) ? 
code - 32 : code; /* codes of 32 and above are shifted down by 32 before the lookup. NOTE(review): baseCode is not range-checked against the 21-entry table - confirm inputs stay within 0..20 and 32..52 */ + compressedBuffers[thrIdx][i] = CODE_TO_CHAR[baseCode]; + } + return compressedBuffers[thrIdx]; /* NOTE(review): no byte is written at index seqLen and the buffer capacity is not checked in this function - confirm against the allocation in open() and against what callers expect */ +} + template char* DBReader::getDataCompressed(size_t id, int thrIdx) { char *data = getDataUncompressed(id); @@ -573,7 +598,9 @@ template size_t DBReader::getAminoAcidDBSize() { template char* DBReader::getData(size_t id, int thrIdx){ if(compression == COMPRESSED){ return getDataCompressed(id, thrIdx); - }else{ + }else if (padded) { + return getUnpadded(id, thrIdx); /* not compressed but padded: return the per-thread copy produced by getUnpadded() */ + } else { return getDataUncompressed(id); } } @@ -628,7 +655,9 @@ template char* DBReader::getDataByDBKey(T dbKey, int thrIdx) { size_t id = getId(dbKey); if(compression == COMPRESSED ){ return (id != UINT_MAX) ? getDataCompressed(id, thrIdx) : NULL; - }else{ + } if(padded) { /* no else precedes this if; it is only reached when the COMPRESSED branch above did not return */ + return (id != UINT_MAX) ? getUnpadded(id, thrIdx) : NULL; + } else{ return (id != UINT_MAX) ? getDataByOffset(index[id].offset) : NULL; } } @@ -1016,6 +1045,7 @@ int DBReader::isCompressed(int dbtype) { return (dbtype & (1 << 31)) ? COMPRESSED : UNCOMPRESSED; } + template void DBReader::setSequentialAdvice() { #ifdef HAVE_POSIX_MADVISE diff --git a/src/commons/DBReader.h b/src/commons/DBReader.h index 57589b1f6..64f274b46 100644 --- a/src/commons/DBReader.h +++ b/src/commons/DBReader.h @@ -174,6 +174,8 @@ class DBReader : public MemoryTracker { char* getDataCompressed(size_t id, int thrIdx); + char* getUnpadded(size_t id, int thrIdx); /* letter copy of entry id, written into the buffer of thread thrIdx */ + char* getDataUncompressed(size_t id); void touchData(size_t id); @@ -479,6 +481,7 @@ class DBReader : public MemoryTracker { // stores the dbtype (if dbtype file exists) int dbtype; int compression; + int padded; /* set in open() from the DBTYPE_EXTENDED_GPU mask test; non-zero routes getData() and getDataByDBKey() through getUnpadded() */ char ** compressedBuffers; size_t * compressedBufferSizes; ZSTD_DStream ** dstream; diff --git a/src/util/convertalignments.cpp b/src/util/convertalignments.cpp index 79fb63b56..d0e8d6c15 100644 --- a/src/util/convertalignments.cpp +++ b/src/util/convertalignments.cpp @@ -337,9 +337,6 @@ int convertalignments(int argc, const char **argv, const Command &command) { 
std::string queryProfData; queryProfData.reserve(1024); - std::string queryBuffer; - queryBuffer.reserve(1024); - std::string queryHeaderBuffer; queryHeaderBuffer.reserve(1024); @@ -366,10 +363,6 @@ int convertalignments(int argc, const char **argv, const Command &command) { size_t qId = qDbr.sequenceReader->getId(queryKey); querySeqData = qDbr.sequenceReader->getData(qId, thread_idx); querySeqLen = qDbr.sequenceReader->getSeqLen(qId); - if(sameDB && qDbr.sequenceReader->isCompressed()){ - queryBuffer.assign(querySeqData, querySeqLen); - querySeqData = (char*) queryBuffer.c_str(); - } if (queryProfile) { size_t queryEntryLen = qDbr.sequenceReader->getEntryLen(qId); Sequence::extractProfileConsensus(querySeqData, queryEntryLen, *subMat, queryProfData);