From ee4916c9cbdfedaa715d618607d629e1278bb375 Mon Sep 17 00:00:00 2001 From: Cameron Gilchrist Date: Tue, 5 Dec 2023 15:06:02 +0900 Subject: [PATCH] fix errors --- src/commons/Newick.cpp | 69 -------------------------- src/commons/Newick.h | 22 -------- src/commons/StructureSmithWaterman.cpp | 13 ++--- src/commons/StructureSmithWaterman.h | 7 +-- src/strucclustutils/refinemsa.cpp | 23 ++++----- src/strucclustutils/refinemsa.h | 2 +- src/strucclustutils/structuremsa.cpp | 28 ++++------- src/strucclustutils/structuremsa.h | 24 +-------- 8 files changed, 33 insertions(+), 155 deletions(-) delete mode 100644 src/commons/Newick.cpp delete mode 100644 src/commons/Newick.h diff --git a/src/commons/Newick.cpp b/src/commons/Newick.cpp deleted file mode 100644 index 7a08565..0000000 --- a/src/commons/Newick.cpp +++ /dev/null @@ -1,69 +0,0 @@ -#include -#include -#include "Newick.h" - - -void parseNewick(std::string newick) { - std::cout << "Tree:\n\t" << newick << std::endl; - - std::vector nodes; - std::vector ancestors; - - nodes.emplace_back(); - TreeNode *node = &nodes.back(); - TreeNode *subtree = nullptr; - - std::string pattern("\\s*(;|\\(|\\)|,|:)\\s*"); - std::regex regex(pattern); - std::sregex_token_iterator iter(newick.begin(), newick.end(), regex, { -1, 0 } ); - std::sregex_token_iterator end; - std::string prevToken; - - while (iter != end) { - std::string token = *iter++; - - if (token == "") - continue; - - if (token == "(") { - nodes.emplace_back(); - subtree = &nodes.back(); - node->children.push_back(subtree); - ancestors.push_back(node); - node = subtree; - - } else if (token == ",") { - nodes.emplace_back(); - subtree = &nodes.back(); - ancestors.back()->children.push_back(subtree); - node = subtree; - - } else if (token == ")") { - node = ancestors.back(); - ancestors.pop_back(); - - } else if (token == ":") { - - } else { - if (prevToken == "") - continue; - if (prevToken == ")" || prevToken == "(" || prevToken == ",") { - node->name = token; - } else { - node->branchLength = std::stof(token); - } - } - - prevToken = token; - } - - std::cout << "Ancestors: " << ancestors.size() << std::endl; - for (size_t i = 0; i < nodes.size(); i++) { - std::cout << "Node: " << nodes[i].name << ", children: "; - for (size_t j = 0; j < nodes[i].children.size(); j++) { - std::cout << nodes[i].children[j]->name << ", "; - } - std::cout << std::endl; - } - -} \ No newline at end of file diff --git a/src/commons/Newick.h b/src/commons/Newick.h deleted file mode 100644 index 07fec92..0000000 --- a/src/commons/Newick.h +++ /dev/null @@ -1,22 +0,0 @@ -#include -#include -#include - -struct TreeNode { - std::string name; - float branchLength; - std::vector children; - TreeNode *parent = nullptr; -}; - -class NewickParser { - private: - std::vector nodes; - TreeNode tree; - int treeSize; - std::map label; - public: -}; - - -void parseNewick(std::string newick); \ No newline at end of file diff --git a/src/commons/StructureSmithWaterman.cpp b/src/commons/StructureSmithWaterman.cpp index af9753a..6d13b3b 100644 --- a/src/commons/StructureSmithWaterman.cpp +++ b/src/commons/StructureSmithWaterman.cpp @@ -645,7 +645,7 @@ StructureSmithWaterman::s_align StructureSmithWaterman::alignStartPosBacktrace(const unsigned char*, const unsigned char*, int32_t, const uint8_t, const uint8_t, const uint8_t, std::string& , StructureSmithWaterman::s_align, const int, const float, const int32_t); -void trimCIGAR(std::string &cigar, int &qStart, int &qEnd, int &tStart, int &tEnd) { +void trimCIGAR(std::string &cigar, int &qEnd, int &tEnd) { int i = 0; while (cigar[i] != 'M') { if (cigar[i] == 'D') { @@ -714,12 +714,13 @@ Matcher::result_t StructureSmithWaterman::simpleGotoh( short **target_profile_word_3di, int32_t query_start, int32_t query_end, int32_t target_start, int32_t target_end, - const short gap_open, const short gap_extend, bool targetIsProfile, - size_t queryId, - size_t targetId + const short gap_open, const short gap_extend + // bool targetIsProfile, + // size_t queryId, + // size_t targetId ) { // defining constants for backtracing - const uint8_t B = 0b00000001; + // const uint8_t B = 0b00000001; const uint8_t H = 0b00000010; const uint8_t F = 0b00000100; const uint8_t E = 0b00001000; @@ -876,7 +877,7 @@ Matcher::result_t StructureSmithWaterman::simpleGotoh( // Adjust CIGAR string to start/end on M // q/dbStart and q/dbEnd are already correct, no need to adjust here // q/dbStart set to last M j/i, q/dbEnd last M .ref/.read - trimCIGAR(cigar, qStart, qEnd, dbStart, dbEnd); + trimCIGAR(cigar, qEnd, dbEnd); delete[] workspace; delete[] btMatrix; diff --git a/src/commons/StructureSmithWaterman.h b/src/commons/StructureSmithWaterman.h index 5afec17..bd7100c 100644 --- a/src/commons/StructureSmithWaterman.h +++ b/src/commons/StructureSmithWaterman.h @@ -160,9 +160,10 @@ class StructureSmithWaterman{ short **target_profile_word_3di, int32_t query_start, int32_t query_end, int32_t target_start, int32_t target_end, - const short gap_open, const short gap_extend, bool targetIsProfile, - size_t queryId, - size_t targetId + const short gap_open, const short gap_extend + // bool targetIsProfile, + // size_t queryId, + // size_t targetId ); /*! @function Create the query profile using the query sequence. diff --git a/src/strucclustutils/refinemsa.cpp b/src/strucclustutils/refinemsa.cpp index 67d80ce..b371e1a 100644 --- a/src/strucclustutils/refinemsa.cpp +++ b/src/strucclustutils/refinemsa.cpp @@ -86,7 +86,6 @@ void refineOne( MsaFilter &filter_3di, SubstitutionMatrix &subMat_3di, StructureSmithWaterman &structureSmithWaterman, - float matchRatio, std::vector &seqLens, bool filterMsa, bool compBiasCorrection, @@ -99,7 +98,6 @@ void refineOne( bool wg, int gapExtend, int gapOpen, - size_t maxSeqLength, std::vector &sequences_aa, std::vector &sequences_ss ) { @@ -125,8 +123,8 @@ void refineOne( deleteGapCols(group2, cigars_aa, cigars_ss); // generate masks for each sub MSA - std::string mask1 = computeProfileMask(group1, cigars_aa, seqLens, subMat_aa, 1); - std::string mask2 = computeProfileMask(group2, cigars_aa, seqLens, subMat_aa, 1); + std::string mask1 = computeProfileMask(group1, cigars_aa, seqLens, subMat_aa, 1.0); + std::string mask2 = computeProfileMask(group2, cigars_aa, seqLens, subMat_aa, 1.0); std::vector map1 = maskToMapping(mask1); std::vector map2 = maskToMapping(mask2); @@ -134,22 +132,22 @@ void refineOne( std::string profile1_aa = msa2profile( group1, cigars_aa, mask1, calculator_aa, filter_aa, subMat_aa, filterMsa, compBiasCorrection, qid, filterMaxSeqId, - Ndiff, covMSAThr, qsc, filterMinEnable, wg, maxSeqLength + Ndiff, covMSAThr, qsc, filterMinEnable, wg ); std::string profile1_ss = msa2profile( group1, cigars_ss, mask1, calculator_3di, filter_3di, subMat_3di, filterMsa, compBiasCorrection, qid, filterMaxSeqId, - Ndiff, covMSAThr, qsc, filterMinEnable, wg, maxSeqLength + Ndiff, covMSAThr, qsc, filterMinEnable, wg ); std::string profile2_aa = msa2profile( group2, cigars_aa, mask2, calculator_aa, filter_aa, subMat_aa, filterMsa, compBiasCorrection, qid, filterMaxSeqId, - Ndiff, covMSAThr, qsc, filterMinEnable, wg, maxSeqLength + Ndiff, covMSAThr, qsc, filterMinEnable, wg ); std::string profile2_ss = msa2profile( group2, cigars_ss, mask2, calculator_3di, filter_3di, subMat_3di, filterMsa, compBiasCorrection, qid, filterMaxSeqId, - Ndiff, covMSAThr, qsc, filterMinEnable, wg, maxSeqLength + Ndiff, covMSAThr, qsc, filterMinEnable, wg ); assert(profile1_aa.length() == profile1_ss.length()); assert(profile2_aa.length() == profile2_ss.length()); @@ -213,7 +211,6 @@ void refineMany( bool compBiasCorrection, bool wg, float filterMaxSeqId, - float matchRatio, float qsc, int Ndiff, float covMSAThr, @@ -256,9 +253,9 @@ void refineMany( cigars_new_aa, cigars_new_ss, calculator_aa, filter_aa, subMat_aa, calculator_3di, filter_3di, subMat_3di, - structureSmithWaterman, matchRatio, lengths, filterMsa, compBiasCorrection, + structureSmithWaterman, lengths, filterMsa, compBiasCorrection, qid, filterMaxSeqId, Ndiff, covMSAThr, qsc, filterMinEnable, - wg, gapExtend, gapOpen, maxSeqLen, + wg, gapExtend, gapOpen, sequences_aa, sequences_ss ); float lddtScore = std::get<2>(calculate_lddt(cigars_new_aa, subset, indices, lengths, seqDbrCA, pairThreshold)); @@ -277,7 +274,7 @@ void refineMany( if (delta > 0.0) { std::cout << std::fixed << std::setprecision(4) << "Final LDDT: " << prevLDDT << " (+" << delta << ")\n"; } else { - std::cout << "Could not improve MSA\n"; + std::cout << "Did not improve MSA\n"; } for (size_t i = 0; i < sequences_aa.size(); i++) { delete sequences_aa[i]; @@ -361,7 +358,7 @@ int refinemsa(int argc, const char **argv, const Command& command) { tinySubMatAA, tinySubMat3Di, &seqDbrCA, cigars_aa, cigars_ss, calculator_aa, filter_aa, subMat_aa, calculator_3di, filter_3di, subMat_3di, structureSmithWaterman, par.refineIters, par.compBiasCorrection, par.wg, par.filterMaxSeqId, - par.matchRatio, par.qsc, par.Ndiff, par.covMSAThr, + par.qsc, par.Ndiff, par.covMSAThr, par.filterMinEnable, par.filterMsa, par.gapExtend.values.aminoacid(), par.gapOpen.values.aminoacid(), par.maxSeqLen, par.qid, par.pairThreshold, indices, lengths ); diff --git a/src/strucclustutils/refinemsa.h b/src/strucclustutils/refinemsa.h index b5b5e85..247b018 100644 --- a/src/strucclustutils/refinemsa.h +++ b/src/strucclustutils/refinemsa.h @@ -18,7 +18,7 @@ void refineMany( bool compBiasCorrection, bool wg, float filterMaxSeqId, - float matchRatio, + // float matchRatio, float qsc, int Ndiff, float covMSAThr, diff --git a/src/strucclustutils/structuremsa.cpp b/src/strucclustutils/structuremsa.cpp index 06a996e..11c8f85 100644 --- a/src/strucclustutils/structuremsa.cpp +++ b/src/strucclustutils/structuremsa.cpp @@ -9,7 +9,6 @@ #include "MathUtil.h" #include "MsaFilter.h" #include "MultipleAlignment.h" -#include "Newick.h" #include "PSSMCalculator.h" #include "Parameters.h" #include "Sequence.h" @@ -303,10 +302,7 @@ Matcher::result_t pairwiseAlignment( 0, target_aa->L, gapOpen, - gapExtend, - targetIsProfile, - query_aa->getId(), - target_aa->getId() + gapExtend ); for (int32_t i = 0; i < aligner.get_profile()->alphabetSize; i++) { @@ -671,7 +667,7 @@ std::string computeProfileMask( for (int i = 0; i < lengthWithGaps; i++) { float matches = colValues[i]; float gaps = colValues[lengthWithGaps + i]; - bool state = (gaps / (gaps + matches)) > matchRatio; + bool state = (gaps / (gaps + matches)) >= matchRatio; mask.push_back(state ? '1' : '0'); } @@ -705,8 +701,7 @@ std::string msa2profile( float covMSAThr, float qsc, int filterMinEnable, - bool wg, - size_t maxSeqLength + bool wg ) { // length of sequences after masking int lengthWithMask = 0; @@ -1240,11 +1235,10 @@ void updateCIGARS( updateTargetCIGAR(cigars_aa[index], cigars_ss[index], tBt, tPreGaps, tPreSequence, tEndGaps, tEndSequence); } -void testSeqLens(std::vector &indices, std::vector > &cigars, std::vector &lengths) { - for (int index : indices) { - int length = cigarLength(cigars[index], false); +void testSeqLens(std::vector &MAYBE_UNUSED(indices), std::vector > &MAYBE_UNUSED(cigars), std::vector &MAYBE_UNUSED(lengths)) { + for (int MAYBE_UNUSED(index) : indices) { + assert(lengths[index] == cigarLength(cigars[index], false)); // std::cout << headers[index] << '\t' << lengths[index] << '\t' << length << '\n'; - assert(lengths[index] == length); } } @@ -1706,7 +1700,7 @@ int structuremsa(int argc, const char **argv, const Command& command, bool preCl if (true) { // calculate LDDT of merged alignment float lddtScore = std::get<2>(calculate_lddt(cigars_aa, groups[mergedId], dbKeys, seqLens, &seqDbrCA, par.pairThreshold)); - std::cout << std::fixed << std::setprecision(3) + std::cout << std::fixed << std::setprecision(4) << queryIsProfile << "\t" << targetIsProfile << '\t' << headers[mergedId] << "\t" << headers[targetId] << "\tLDDT: " << lddtScore << '\t' << res.score; if (tmaligned){ @@ -1737,8 +1731,7 @@ if (true) { par.covMSAThr, par.qsc, par.filterMinEnable, - par.wg, - 1 + par.wg ); std::string profile_3di = msa2profile( groups[mergedId], @@ -1755,8 +1748,7 @@ if (true) { par.covMSAThr, par.qsc, par.filterMinEnable, - par.wg, - par.maxSeqLen + par.wg ); assert(profile_aa.length() == profile_3di.length()); @@ -1790,7 +1782,7 @@ if (true) { refineMany( tinySubMatAA, tinySubMat3Di, &seqDbrCA, cigars_aa, cigars_ss, calculator_aa, filter_aa, subMat_aa, calculator_3di, filter_3di, subMat_3di, structureSmithWaterman, - par.refineIters, par.compBiasCorrection, par.wg, par.filterMaxSeqId, par.matchRatio, par.qsc, + par.refineIters, par.compBiasCorrection, par.wg, par.filterMaxSeqId, par.qsc, par.Ndiff, par.covMSAThr, par.filterMinEnable, par.filterMsa, par.gapExtend.values.aminoacid(), par.gapOpen.values.aminoacid(), par.maxSeqLen, par.qid, par.pairThreshold, dbKeys, seqLens ); diff --git a/src/strucclustutils/structuremsa.h b/src/strucclustutils/structuremsa.h index 12c852a..f5768fb 100644 --- a/src/strucclustutils/structuremsa.h +++ b/src/strucclustutils/structuremsa.h @@ -63,27 +63,6 @@ union Instruction2 { } }; -std::string fastamsa2profile( - std::string & msa, - PSSMCalculator &pssmCalculator, - MsaFilter &filter, - SubstitutionMatrix &subMat, - size_t maxSeqLength, - size_t maxSetSize, - float matchRatio, - bool filterMsa, - bool compBiasCorrection, - std::string & qid, - float filterMaxSeqId, - float Ndiff, - float covMSAThr, - float qsc, - int filterMinEnable, - bool wg, - bool *externalMaskedColumns, - float scoreBias -); - void getMergeInstructions( Matcher::result_t &res, std::vector &map1, @@ -131,8 +110,7 @@ std::string msa2profile( float covMSAThr, float qsc, int filterMinEnable, - bool wg, - size_t maxSeqLength + bool wg ); void updateCIGARS(