diff --git a/include/reportsystem.h b/include/reportsystem.h index 45aeecc1a..a8bae2305 100644 --- a/include/reportsystem.h +++ b/include/reportsystem.h @@ -313,6 +313,10 @@ enum InfoCode { SimilarityThreshold = 8, + ColumnsToKeep = 9, + + BlockSize = 10, + __MAXINFO }; diff --git a/source/Cleaner.cpp b/source/Cleaner.cpp index 335a614fa..fe0bf7dcb 100644 --- a/source/Cleaner.cpp +++ b/source/Cleaner.cpp @@ -108,9 +108,9 @@ Alignment *Cleaner::cleanByCutValueOverpass( int residueIndex, numberColumnsToRecover, *keptResiduesGaps; int oldNumberKeptResidues = 0, newNumberKeptResidues = 0; Alignment *newAlig = new Alignment(*alig); - double gapThreshold = maxGaps / alig->originalNumberOfSequences; - - debug.report(InfoCode::GapThreshold, new std::string[2]{std::to_string((int) maxGaps), std::to_string(gapThreshold)}); + + double maxGapThreshold = ((double) maxGaps / alig->originalNumberOfSequences) * 100.0; + debug.report(InfoCode::GapThreshold, new std::string[2]{std::to_string((int) maxGaps), std::to_string(maxGapThreshold)}); // Select the columns with a gaps value // less or equal than the cut point. @@ -238,9 +238,10 @@ Alignment *Cleaner::cleanByCutValueOverpass( // Compute new sequences and columns numbers newAlig->Cleaning->removeAllGapsSeqsAndCols(); + + // Return the new alignment reference return newAlig; - } Alignment *Cleaner::cleanByCutValueFallBehind( @@ -535,15 +536,31 @@ Alignment *Cleaner::cleanStrict(int gapCut, const int *gInCol, float simCut, con int i, x, pos, counter, lenBlock; Alignment *newAlig = new Alignment(*alig); - double gapThreshold = (double) gapCut / alig->originalNumberOfSequences; - debug.report(InfoCode::GapThreshold, new std::string[2]{std::to_string(gapCut), std::to_string(gapThreshold)}); + double maxGapThreshold = ((double) gapCut / alig->originalNumberOfSequences) * 100.0; + debug.report(InfoCode::GapThreshold, new std::string[2]{std::to_string(gapCut), std::to_string(maxGapThreshold)}); + debug.report(InfoCode::SimilarityThreshold, new std::string[1]{std::to_string(simCut)}); // Reject columns with gaps number greater than the gap threshold. - for (i = 0; i < alig->originalNumberOfResidues; i++) { - if (gInCol[i] > gapCut || MDK_W[i] < simCut) - newAlig->saveResidues[i] = -1; + int residueIndex = 0; + int oldNumberKeptResidues = 0; + int newNumberKeptResidues = 0; + for (residueIndex = 0; residueIndex < alig->originalNumberOfResidues; residueIndex++) { + if (alig->saveResidues[residueIndex] == -1) continue; + + if (gInCol[residueIndex] > gapCut || MDK_W[residueIndex] < simCut) { + newAlig->saveResidues[residueIndex] = -1; + } else { + newNumberKeptResidues++; + } + + oldNumberKeptResidues++; } + double percentageColumnsOriginal = ((double) oldNumberKeptResidues / alig->originalNumberOfResidues) * 100.0; + debug.report(InfoCode::ColumnsToKeep, new std::string[3]{"before strict method", std::to_string(oldNumberKeptResidues), std::to_string(percentageColumnsOriginal)}); + percentageColumnsOriginal = ((double) newNumberKeptResidues / alig->originalNumberOfResidues) * 100.0; + debug.report(InfoCode::ColumnsToKeep, new std::string[3]{"after strict method clean based on gaps and similarity", std::to_string(newNumberKeptResidues), std::to_string(percentageColumnsOriginal)}); + // Rescue residues based on their neighbouring residues. We are going to rescue those residues that would be rejected but have at least, 3 non-rejected residues. { // We're going to store a 5-value window of values that we are goint to move residue by residue. @@ -727,6 +744,10 @@ Alignment *Cleaner::cleanStrict(int gapCut, const int *gInCol, float simCut, con // Compute new sequences and columns numbers newAlig->Cleaning->removeAllGapsSeqsAndCols(); + percentageColumnsOriginal = ((double) newAlig->numberOfResidues / alig->originalNumberOfResidues) * 100.0; + debug.report(InfoCode::BlockSize, new std::string[1]{std::to_string(blockSize)}); + debug.report(InfoCode::ColumnsToKeep, new std::string[3]{"after strict method recovered neighbours and filtered by block size", std::to_string(newAlig->numberOfResidues), std::to_string(percentageColumnsOriginal)}); + return newAlig; } diff --git a/source/reportMessages/infoMessages.cpp b/source/reportMessages/infoMessages.cpp index 6c1bb4ac1..52435d02c 100644 --- a/source/reportMessages/infoMessages.cpp +++ b/source/reportMessages/infoMessages.cpp @@ -53,10 +53,16 @@ const std::map reporting::reportManager::InfoMessages = "No filtering or calculation performed. Output same MSA as input."}, {InfoCode::GapThreshold, - "The maximum amount of gaps is \"[tag]\". The threshold value used is \"[tag]\". Column/sequence trimming may be affected by other parameters (e.g." - " conservation threshold)."}, + "The maximum amount of gaps is \"[tag]\", representing a gap in the \"[tag]\" percent or more of the sequences. Column/sequence trimming may be affected by other parameters (e.g." + " conservation threshold, block size)."}, {InfoCode::SimilarityThreshold, "The similarity threshold value used is \"[tag]\". Column/sequence trimming may be affected by other parameters (e.g." - " conservation threshold)."} + " conservation threshold, block size)."}, + + {InfoCode::ColumnsToKeep, + "The number of columns to keep [tag] is \"[tag]\", representing \"[tag]\" percent of the original alignment"}, + + {InfoCode::BlockSize, + "The block size used is \"[tag]\""}, }; \ No newline at end of file