diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index 302919d4c..7e62c7c15 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -2515,9 +2515,9 @@ void Parameters::setDefaults() { resultDirection = Parameters::PARAM_RESULT_DIRECTION_TARGET; weightThr = 0.9; weightFile = ""; - // TODO: change to true after fixing regression tests - matchAdjacentSeq = false; + matchAdjacentSeq = true; hashSeqBuffer = 1.05; + numDiskBuffer = 0; // result2stats stat = ""; diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index 0fa75e597..d69a57d0e 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -554,6 +554,7 @@ class Parameters { std::string weightFile; bool matchAdjacentSeq; float hashSeqBuffer; + int numDiskBuffer; // indexdb int checkCompatible; diff --git a/src/linclust/kmermatcher.cpp b/src/linclust/kmermatcher.cpp index 7bb3f076e..f87098787 100644 --- a/src/linclust/kmermatcher.cpp +++ b/src/linclust/kmermatcher.cpp @@ -470,6 +470,53 @@ template void swapCenterSequence<0, int, true>(KmerPosition *kmers, s template void swapCenterSequence<1, short, true>(KmerPosition *kmers, size_t splitKmerCount, SequenceWeights &seqWeights); template void swapCenterSequence<1, int, true>(KmerPosition *kmers, size_t splitKmerCount, SequenceWeights &seqWeights); +template +void resizeBuffer(size_t totalKmers, size_t hashStartRange, size_t hashEndRange, DBReader & seqDbr, + Parameters & par, BaseMatrix * subMat) { + + Debug(Debug::INFO) << "Resize additional memory\n"; + Timer timer; + KmerPosition * hashSeqPair = initKmerPositionMemory(totalKmers); + size_t elementsToSort; + if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)){ + std::pair ret = fillKmerPositionArray(hashSeqPair, totalKmers, seqDbr, par, subMat, true, hashStartRange, hashEndRange, NULL); + elementsToSort = ret.first; + par.kmerSize = ret.second; + }else{ + std::pair ret = fillKmerPositionArray(hashSeqPair, totalKmers, seqDbr, par, subMat, true, hashStartRange, hashEndRange, NULL); + elementsToSort = ret.first; + } + + if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)) { + SORT_PARALLEL(hashSeqPair, hashSeqPair + elementsToSort, KmerPosition::compareRepSequenceAndIdAndPosReverse); + }else{ + SORT_PARALLEL(hashSeqPair, hashSeqPair + elementsToSort, KmerPosition::compareRepSequenceAndIdAndPos); + } + + SequenceWeights *sequenceWeights = NULL; + if (par.PARAM_WEIGHT_FILE.wasSet) { + sequenceWeights = new SequenceWeights(par.weightFile.c_str()); + if (sequenceWeights != NULL) { + if (Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)) { + swapCenterSequence(hashSeqPair, totalKmers, *sequenceWeights); + } else { + swapCenterSequence(hashSeqPair, totalKmers, *sequenceWeights); + } + } + } + + std::string splitFile = "None"; + if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)){ + assignGroup(hashSeqPair, totalKmers, par.includeOnlyExtendable, par.covMode, par.covThr, sequenceWeights, par.weightThr, subMat, par.hashSeqBuffer, splitFile, par.numDiskBuffer); + }else{ + assignGroup(hashSeqPair, totalKmers, par.includeOnlyExtendable, par.covMode, par.covThr, sequenceWeights, par.weightThr, subMat, par.hashSeqBuffer, splitFile, par.numDiskBuffer); + } + Debug(Debug::INFO) << "Time for resizing: " << timer.lap() << "\n\n"; + + delete sequenceWeights; + delete [] hashSeqPair; +} + template KmerPosition * doComputation(size_t &totalKmers, size_t hashStartRange, size_t hashEndRange, std::string splitFile, DBReader & seqDbr, Parameters & par, BaseMatrix * subMat) { @@ -515,17 +562,12 @@ KmerPosition * doComputation(size_t &totalKmers, size_t h // The longest sequence is the first since we sorted by kmer, seq.Len and id size_t writePos; if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)){ - writePos = assignGroup(hashSeqPair, totalKmers, par.includeOnlyExtendable, par.covMode, par.covThr, sequenceWeights, par.weightThr, subMat, par.hashSeqBuffer); + writePos = assignGroup(hashSeqPair, totalKmers, par.includeOnlyExtendable, par.covMode, par.covThr, sequenceWeights, par.weightThr, subMat, par.hashSeqBuffer, splitFile, par.numDiskBuffer); }else{ - writePos = assignGroup(hashSeqPair, totalKmers, par.includeOnlyExtendable, par.covMode, par.covThr, sequenceWeights, par.weightThr, subMat, par.hashSeqBuffer); + writePos = assignGroup(hashSeqPair, totalKmers, par.includeOnlyExtendable, par.covMode, par.covThr, sequenceWeights, par.weightThr, subMat, par.hashSeqBuffer, splitFile, par.numDiskBuffer); } delete sequenceWeights; - if (writePos == SIZE_T_MAX) { - delete [] hashSeqPair; - totalKmers = 0; - return NULL; - } // sort by rep. sequence (stored in kmer) and sequence id Debug(Debug::INFO) << "Sort by rep. sequence "; @@ -541,7 +583,7 @@ KmerPosition * doComputation(size_t &totalKmers, size_t h // } Debug(Debug::INFO) << timer.lap() << "\n"; - if(hashEndRange != SIZE_T_MAX){ + if(hashEndRange != SIZE_T_MAX || par.numDiskBuffer > 0){ if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)){ writeKmersToDisk(splitFile, hashSeqPair, writePos + 1); }else{ @@ -556,7 +598,7 @@ KmerPosition * doComputation(size_t &totalKmers, size_t h template size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, - SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *, float &) { + SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *, float &, std::string, int &) { size_t writePos=0; size_t prevHash = hashSeqPair[0].kmer; @@ -675,8 +717,9 @@ size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, b template size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, - SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer) { + SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer, std::string tmpFile, int &numDiskBuffer) { + size_t totalSplitKmerCount = splitKmerCount; // change splitKmerCount to exclude additional memory splitKmerCount = static_cast(splitKmerCount / hashSeqBuffer); // declare variables @@ -700,6 +743,12 @@ size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bo size_t repSeqNum = 20; for (size_t elementIdx = 0; elementIdx < splitKmerCount+1; elementIdx++) { + // Reallocate module + if (tmpFile == "None" && elementIdx == static_cast(splitKmerCount / 10)) { + hashSeqBuffer = 1.05 + (static_cast(writeTmp) / static_cast(splitKmerCount)) * 11; + return SIZE_T_MAX; + } + size_t currKmer = hashSeqPair[elementIdx].kmer; if (TYPE == Parameters::DBTYPE_NUCLEOTIDES) { currKmer = BIT_SET(currKmer, 63); @@ -796,7 +845,7 @@ size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bo hashSeqPair[writePos].id = hashSeqPair[i].id; writePos++; // otherwise, store information sequentially starting from the splitKmerCount - }else if (splitKmerCount + writeTmp < hashSeqBuffer * splitKmerCount) { + }else if (splitKmerCount + writeTmp < totalSplitKmerCount-1) { hashSeqPair[splitKmerCount + writeTmp].kmer = rId; hashSeqPair[splitKmerCount + writeTmp].pos = diagonal; hashSeqPair[splitKmerCount + writeTmp].seqLen = hashSeqPair[i].seqLen; @@ -804,10 +853,39 @@ size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bo writeTmp++; // if both are impossible, increase hashSeqPair's memory and split again }else { - // calculate elaborate buffer size based on the progress rate - hashSeqBuffer = 1 + (static_cast(splitKmerCount) / static_cast(writePos)) * (hashSeqBuffer - 1) * 1.2; - Debug(Debug::INFO) << "\n" << "Buffer size is unsufficient, splitting again" << "\n\n"; - return SIZE_T_MAX; + // Reallocate module + if (tmpFile == "None") { + hashSeqBuffer = 1 + (static_cast(splitKmerCount) / static_cast(writePos)) * (hashSeqBuffer - 1) * 1.2; + return SIZE_T_MAX; + } + + // Hard disk writing module + hashSeqPair[splitKmerCount + writeTmp].kmer = rId; + hashSeqPair[splitKmerCount + writeTmp].pos = diagonal; + hashSeqPair[splitKmerCount + writeTmp].seqLen = hashSeqPair[i].seqLen; + hashSeqPair[splitKmerCount + writeTmp].id = hashSeqPair[i].id; + writeTmp++; + + Debug(Debug::INFO) << "\nUnsufficient memory, Record contents to disk\n"; + std::string bufferName = tmpFile + "_" + SSTR(numDiskBuffer); + // if(TYPE == Parameters::DBTYPE_NUCLEOTIDES){ + // SORT_PARALLEL(&hashSeqPair[splitKmerCount], &hashSeqPair[splitKmerCount] + writeTmp, KmerPosition::compareRepSequenceAndIdAndDiagReverse); + // writeKmersToDisk(bufferName, &hashSeqPair[splitKmerCount], writeTmp + 1); + // }else{ + // SORT_PARALLEL(&hashSeqPair[splitKmerCount], &hashSeqPair[splitKmerCount] + writeTmp, KmerPosition::compareRepSequenceAndIdAndDiag); + // writeKmersToDisk(bufferName, &hashSeqPair[splitKmerCount], writeTmp + 1); + // } + // numDiskBuffer++; + // writeTmp = 0; + if(TYPE == Parameters::DBTYPE_NUCLEOTIDES){ + SORT_PARALLEL(hashSeqPair, hashSeqPair + writePos, KmerPosition::compareRepSequenceAndIdAndDiagReverse); + writeKmersToDisk(bufferName, hashSeqPair, writePos); + }else{ + SORT_PARALLEL(hashSeqPair, hashSeqPair + writePos, KmerPosition::compareRepSequenceAndIdAndDiag); + writeKmersToDisk(bufferName, hashSeqPair, writePos); + } + numDiskBuffer++; + writePos = 0; } } } @@ -845,15 +923,15 @@ size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bo return writePos; } -template size_t assignGroup<0, short>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); -template size_t assignGroup<0, int>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); -template size_t assignGroup<1, short>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); -template size_t assignGroup<1, int>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); +template size_t assignGroup<0, short>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer, std::string tmpFile, int &numDiskBuffer); +template size_t assignGroup<0, int>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer, std::string tmpFile, int &numDiskBuffer); +template size_t assignGroup<1, short>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer, std::string tmpFile, int &numDiskBuffer); +template size_t assignGroup<1, int>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer, std::string tmpFile, int &numDiskBuffer); -template size_t assignGroup<0, short>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); -template size_t assignGroup<0, int>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); -template size_t assignGroup<1, short>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); -template size_t assignGroup<1, int>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); +template size_t assignGroup<0, short>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer, std::string tmpFile, int &numDiskBuffer); +template size_t assignGroup<0, int>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer, std::string tmpFile, int &numDiskBuffer); +template size_t assignGroup<1, short>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer, std::string tmpFile, int &numDiskBuffer); +template size_t assignGroup<1, int>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer, std::string tmpFile, int &numDiskBuffer); void setLinearFilterDefault(Parameters *p) { @@ -907,8 +985,20 @@ int kmermatcherInner(Parameters& par, DBReader& seqDbr) { Debug(Debug::INFO) << "\n"; float kmersPerSequenceScale = (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) ? par.kmersPerSequenceScale.values.nucleotide() : par.kmersPerSequenceScale.values.aminoacid(); - size_t totalKmers = static_cast(computeKmerCount(seqDbr, par.kmerSize, par.kmersPerSequence, kmersPerSequenceScale) * par.hashSeqBuffer); + size_t totalKmers = computeKmerCount(seqDbr, par.kmerSize, par.kmersPerSequence, kmersPerSequenceScale); size_t totalSizeNeeded = computeMemoryNeededLinearfilter(totalKmers); + // resize additional memory + if(IncludeAdjacentSeq){ + size_t tmpSizeNeeded = static_cast(totalSizeNeeded * par.hashSeqBuffer); + size_t splits = static_cast(std::ceil(static_cast(tmpSizeNeeded) / memoryLimit)); + size_t totalKmersPerSplit = std::max(static_cast(1024+1), + static_cast(std::min(tmpSizeNeeded, memoryLimit)/sizeof(KmerPosition))+1); + + std::vector> hashRanges = setupKmerSplits(par, subMat, seqDbr, totalKmersPerSplit, splits); + resizeBuffer(totalKmersPerSplit, hashRanges[0].first, hashRanges[0].second, seqDbr, par, subMat); + } + totalKmers = static_cast(totalKmers * par.hashSeqBuffer); + totalSizeNeeded = static_cast(totalSizeNeeded * par.hashSeqBuffer); // compute splits size_t splits = static_cast(std::ceil(static_cast(totalSizeNeeded) / memoryLimit)); size_t totalKmersPerSplit = std::max(static_cast(1024+1), @@ -926,6 +1016,7 @@ int kmermatcherInner(Parameters& par, DBReader& seqDbr) { splits = hashRanges.size(); size_t fromSplit = 0; size_t splitCount = 1; + std::vector splitBuffers; mpiRank = MMseqsMPI::rank; // if split size is great than nodes than we have to // distribute all splits equally over all nodes @@ -941,37 +1032,39 @@ int kmermatcherInner(Parameters& par, DBReader& seqDbr) { delete[] splitCntPerProc; for(size_t split = fromSplit; split < fromSplit+splitCount; split++) { + par.numDiskBuffer = 0; std::string splitFileName = par.db2 + "_split_" +SSTR(split); hashSeqPair = doComputation(totalKmers, hashRanges[split].first, hashRanges[split].second, splitFileName, seqDbr, par, subMat); - } - // detect insufficient buffer size - if (totalKmers == 0) { - delete subMat; - return EXIT_SUCCESS; + splitBuffers.push_back(par.numDiskBuffer); } MPI_Barrier(MPI_COMM_WORLD); if(mpiRank == 0){ for(size_t split = 0; split < splits; split++) { std::string splitFileName = par.db2 + "_split_" +SSTR(split); splitFiles.push_back(splitFileName); + for(int j = 0; j < splitBuffers[split]; j++) { + splitBufferName = splitFileName + "_" + SSTR(j); + splitFiles.push_back(splitBufferName); + } } } #else for(size_t split = 0; split < hashRanges.size(); split++) { + par.numDiskBuffer = 0; std::string splitFileName = par.db2 + "_split_" +SSTR(split); + std::string splitBufferName; Debug(Debug::INFO) << "Generate k-mers list for " << (split+1) <<" split\n"; std::string splitFileNameDone = splitFileName + ".done"; if(FileUtil::fileExists(splitFileNameDone.c_str()) == false){ hashSeqPair = doComputation(totalKmersPerSplit, hashRanges[split].first, hashRanges[split].second, splitFileName, seqDbr, par, subMat); } - // detect insufficient buffer size - if (totalKmersPerSplit == 0) { - delete subMat; - return EXIT_SUCCESS; - } splitFiles.push_back(splitFileName); + for(int j = 0; j < par.numDiskBuffer; j++){ + splitBufferName = splitFileName + "_" + SSTR(j); + splitFiles.push_back(splitBufferName); + } } #endif if(mpiRank == 0){ @@ -983,7 +1076,7 @@ int kmermatcherInner(Parameters& par, DBReader& seqDbr) { dbw.open(); Timer timer; - if(splits > 1) { + if(splits > 1 || par.numDiskBuffer > 0) { seqDbr.unmapData(); if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)) { mergeKmerFilesAndOutput(dbw, splitFiles, repSequence); @@ -1083,43 +1176,13 @@ std::vector> setupKmerSplits(Parameters &par, BaseMatr return hashRanges; } -// https://github.com/soedinglab/MMseqs2/pull/873#issue-2464876011 -void matchWithAdjacentSeq(Parameters &par,int argc, const char **argv, const Command &command) { - float hashSeqBuffer; - bool firstIt = true; - do { - // FIXME: currently have to reopen DB every iteration due to DBReader::unmapData() - DBReader seqDbr( - par.db1.c_str(), - par.db1Index.c_str(), - par.threads, - DBReader::USE_INDEX | DBReader::USE_DATA - ); - seqDbr.open(DBReader::NOSORT); - int querySeqType = seqDbr.getDbtype(); - - // print is executed only once - if (firstIt) { - firstIt = false; - setKmerLengthAndAlphabet(par, seqDbr.getAminoAcidDBSize(), querySeqType); - std::vector *params = command.params; - par.printParameters(command.cmd, argc, argv, *params); - Debug(Debug::INFO) << "Database size: " << seqDbr.getSize() << " type: " << seqDbr.getDbTypeName() << "\n"; - } - - // if the buffer size is insufficient, par.hashSeqBuffer is changed and repeat split again - hashSeqBuffer = par.hashSeqBuffer; - if (seqDbr.getMaxSeqLen() < SHRT_MAX) { - kmermatcherInner(par, seqDbr); - } - else { - kmermatcherInner(par, seqDbr); - } - seqDbr.close(); - } while (hashSeqBuffer != par.hashSeqBuffer); -} +int kmermatcher(int argc, const char **argv, const Command &command) { + MMseqsMPI::init(argc, argv); + + Parameters &par = Parameters::getInstance(); + setLinearFilterDefault(&par); + par.parseParameters(argc, argv, command, true, 0, MMseqsParameter::COMMAND_CLUSTLINEAR); -void matchWithoutAdjacentSeq(Parameters &par,int argc, const char **argv, const Command &command) { DBReader seqDbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader::USE_INDEX | DBReader::USE_DATA); seqDbr.open(DBReader::NOSORT); @@ -1131,31 +1194,25 @@ void matchWithoutAdjacentSeq(Parameters &par,int argc, const char **argv, const Debug(Debug::INFO) << "Database size: " << seqDbr.getSize() << " type: " << seqDbr.getDbTypeName() << "\n"; if (seqDbr.getMaxSeqLen() < SHRT_MAX) { - kmermatcherInner(par, seqDbr); + if (par.matchAdjacentSeq) { + kmermatcherInner(par, seqDbr); + } + else { + par.hashSeqBuffer = 1.0; + kmermatcherInner(par, seqDbr); + } } else { - kmermatcherInner(par, seqDbr); + if (par.matchAdjacentSeq) { + kmermatcherInner(par, seqDbr); + } + else { + par.hashSeqBuffer = 1.0; + kmermatcherInner(par, seqDbr); + } } seqDbr.close(); -} - -int kmermatcher(int argc, const char **argv, const Command &command) { - MMseqsMPI::init(argc, argv); - - Parameters &par = Parameters::getInstance(); - setLinearFilterDefault(&par); - par.parseParameters(argc, argv, command, true, 0, MMseqsParameter::COMMAND_CLUSTLINEAR); - - bool matchAdjacentSeq = par.matchAdjacentSeq; - - if (matchAdjacentSeq) { - matchWithAdjacentSeq(par, argc, argv, command); - } else { - // overwrite value (no need for buffer) - par.hashSeqBuffer = 1.0; - matchWithoutAdjacentSeq(par, argc, argv, command); - } return EXIT_SUCCESS; } diff --git a/src/linclust/kmermatcher.h b/src/linclust/kmermatcher.h index 1b7d158b4..caecc4c2a 100644 --- a/src/linclust/kmermatcher.h +++ b/src/linclust/kmermatcher.h @@ -217,10 +217,10 @@ class CompareResultBySeqId { template size_t assignGroup(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, - SequenceWeights * sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); + SequenceWeights * sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer, std::string tmpFile, int &numDiskBuffer); template size_t assignGroup(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, - SequenceWeights * sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); + SequenceWeights * sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer, std::string tmpFile, int &numDiskBuffer); template void mergeKmerFilesAndOutput(DBWriter & dbw, std::vector tmpFiles, std::vector &repSequence); @@ -239,11 +239,13 @@ template void writeKmerMatcherResult(DBWriter & dbw, KmerPosition *hashSeqPair, size_t totalKmers, std::vector &repSequence, size_t threads); - +template +void resizeBuffer(size_t totalKmers, size_t hashStartRange, size_t hashEndRange, DBReader & seqDbr, + Parameters & par, BaseMatrix * subMat); template KmerPosition * doComputation(size_t &totalKmers, size_t split, size_t splits, std::string splitFile, - DBReader & seqDbr, Parameters & par, BaseMatrix * subMat, - size_t KMER_SIZE, size_t chooseTopKmer, float chooseTopKmerScale = 0.0); + DBReader & seqDbr, Parameters & par, BaseMatrix * subMat, + size_t KMER_SIZE, size_t chooseTopKmer, float chooseTopKmerScale = 0.0); template KmerPosition *initKmerPositionMemory(size_t size);