Skip to content

Commit a2d01d0

Browse files
Rework masking and add N repeat masking
1 parent e3b16fa commit a2d01d0

22 files changed

+241
-181
lines changed

Diff for: lib/tantan/tantan.cpp

+9-5
Original file line numberDiff line numberDiff line change
@@ -479,7 +479,7 @@ struct Tantan {
479479
}
480480
};
481481

482-
void maskSequences(uchar *seqBeg,
482+
int maskSequences(uchar *seqBeg,
483483
uchar *seqEnd,
484484
int maxRepeatOffset,
485485
const const_double_ptr *likelihoodRatioMatrix,
@@ -498,7 +498,7 @@ void maskSequences(uchar *seqBeg,
498498
repeatOffsetProbDecay, firstGapProb, otherGapProb,
499499
probabilities);
500500

501-
maskProbableLetters(seqBeg, seqEnd, probabilities, minMaskProb, maskTable);
501+
return maskProbableLetters(seqBeg, seqEnd, probabilities, minMaskProb, maskTable);
502502
}
503503

504504
void getProbabilities(const uchar *seqBeg,
@@ -517,17 +517,21 @@ void getProbabilities(const uchar *seqBeg,
517517
tantan.calcRepeatProbs(probabilities);
518518
}
519519

520-
void maskProbableLetters(uchar *seqBeg,
520+
int maskProbableLetters(uchar *seqBeg,
521521
uchar *seqEnd,
522522
const float *probabilities,
523523
double minMaskProb,
524524
const uchar *maskTable) {
525+
int masked = 0;
525526
while (seqBeg < seqEnd) {
526-
if (*probabilities >= minMaskProb)
527-
*seqBeg = maskTable[*seqBeg];
527+
if (*probabilities >= minMaskProb) {
528+
*seqBeg = maskTable[*seqBeg];
529+
masked++;
530+
}
528531
++probabilities;
529532
++seqBeg;
530533
}
534+
return masked;
531535
}
532536

533537
void countTransitions(const uchar *seqBeg,

Diff for: lib/tantan/tantan.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ namespace tantan {
5757
typedef unsigned char uchar;
5858
typedef const double *const_double_ptr;
5959

60-
void maskSequences(uchar *seqBeg,
60+
int maskSequences(uchar *seqBeg,
6161
uchar *seqEnd,
6262
int maxRepeatOffset,
6363
const const_double_ptr *likelihoodRatioMatrix,
@@ -87,7 +87,7 @@ void getProbabilities(const uchar *seqBeg,
8787
// The following routine masks each letter whose corresponding entry
8888
// in "probabilities" is >= minMaskProb.
8989

90-
void maskProbableLetters(uchar *seqBeg,
90+
int maskProbableLetters(uchar *seqBeg,
9191
uchar *seqEnd,
9292
const float *probabilities,
9393
double minMaskProb,

Diff for: src/alignment/CMakeLists.txt

-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ set(alignment_header_files
66
alignment/MsaFilter.h
77
alignment/MultipleAlignment.h
88
alignment/PSSMCalculator.h
9-
alignment/PSSMMasker.h
109
alignment/StripedSmithWaterman.h
1110
alignment/BandedNucleotideAligner.h
1211
alignment/DistanceCalculator.h

Diff for: src/alignment/PSSMMasker.h

-52
This file was deleted.

Diff for: src/commons/CMakeLists.txt

+2
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ set(commons_header_files
2222
commons/itoa.h
2323
commons/KSeqBufferReader.h
2424
commons/KSeqWrapper.h
25+
commons/Masker.h
2526
commons/MathUtil.h
2627
commons/MemoryMapped.h
2728
commons/MemoryTracker.h
@@ -60,6 +61,7 @@ set(commons_source_files
6061
commons/FileUtil.cpp
6162
commons/HeaderSummarizer.cpp
6263
commons/KSeqWrapper.cpp
64+
commons/Masker.cpp
6365
commons/MemoryMapped.cpp
6466
commons/MemoryTracker.cpp
6567
commons/MMseqsMPI.cpp

Diff for: src/commons/Masker.cpp

+136
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
#include "Masker.h"
2+
#include <algorithm> // for std::toupper
3+
4+
/**
 * Sets up a masker for the given substitution matrix.
 * probMatrix is constructed from the same matrix. The scratch buffer starts
 * at a single byte; maskPssm() enlarges it when a longer sequence arrives.
 */
Masker::Masker(BaseMatrix &s) : subMat(s), probMatrix(s)
{
    // numeric code of the letter 'X'; masked residues are set to this code
    maskLetterNum = subMat.aa2num[static_cast<int>('X')];

    maxSeqLen = 1;
    charSequence = static_cast<unsigned char *>(malloc(sizeof(char) * maxSeqLen));
}
10+
11+
// Releases the scratch buffer that the constructor allocated and that
// maskPssm() may have grown with realloc.
Masker::~Masker() {
    free(charSequence);
}
14+
15+
/**
 * Masks seq.numSequence in place by running the requested stages in a fixed
 * order: tantan, single-letter repeat runs, lowercase letters, and finally a
 * pass that maps every masked position to maskLetterNum.
 *
 * @param seq                 sequence whose numeric codes are overwritten
 * @param maskTantan          run tantan masking
 * @param maskProb            tantan minimum masking probability
 * @param maskLowerCaseLetter mask positions that are lowercase in seq.getSeqData()
 *                            (only for amino acid and nucleotide sequence types)
 * @param maskNrepeats        mask runs of one letter longer than this; <= 0 disables the stage
 * @return sum of the per-stage counts; a residue hit by more than one stage
 *         is counted once per stage, so this is not a count of distinct positions
 */
int Masker::maskSequence(Sequence & seq, bool maskTantan, double maskProb,
                         bool maskLowerCaseLetter, int maskNrepeats) {
    int maskedTotal = 0;

    // Stage 1: tantan runs first, so it sees the sequence before repeat masking
    if (maskTantan) {
        maskedTotal += tantan::maskSequences(seq.numSequence,
                                             seq.numSequence + seq.L,
                                             50 /*maxCycleLength*/,
                                             probMatrix.probMatrixPointers,
                                             0.005 /*repeatProb*/,
                                             0.05 /*repeatEndProb*/,
                                             0.9 /*repeatOffsetProbDecay*/,
                                             0, 0,
                                             maskProb /*minMaskProb*/,
                                             probMatrix.hardMaskTable);
    }

    // Stage 2: runs of a single repeated letter
    if (maskNrepeats > 0) {
        maskedTotal += maskRepeats(seq.numSequence, seq.L, maskNrepeats, maskLetterNum);
    }

    // Stage 3: lowercase letters of the original character data
    if (maskLowerCaseLetter
        && (Parameters::isEqualDbtype(seq.getSequenceType(), Parameters::DBTYPE_AMINO_ACIDS)
            || Parameters::isEqualDbtype(seq.getSequenceType(), Parameters::DBTYPE_NUCLEOTIDES))) {
        const char *rawLetters = seq.getSeqData();
        for (int pos = 0; pos < seq.L; ++pos) {
            if (std::islower((unsigned char)rawLetters[pos])) {
                seq.numSequence[pos] = maskLetterNum;
                ++maskedTotal;
            }
        }
    }

    // Stage 4: unify the mask codes, but only if some stage was requested
    const bool anyStageRequested = maskTantan || maskNrepeats || maskLowerCaseLetter;
    if (anyStageRequested) {
        finalizeMasking(seq.numSequence, seq.L);
    }
    return maskedTotal;
}
56+
57+
/**
 * Runs tantan on a private copy of the center sequence and neutralises the
 * matching profile columns: every position that tantan masked gets -1 in all
 * Sequence::PROFILE_AA_SIZE entries of pssmRes.pssm. The sequence itself is
 * not modified.
 *
 * @param centerSequence sequence the profile was built around
 * @param maskProb       tantan minimum masking probability
 * @param pssmRes        profile whose pssm entries are overwritten
 */
void Masker::maskPssm(Sequence& centerSequence, float maskProb, PSSMCalculator::Profile& pssmRes) {
    // Grow the scratch buffer (with 50% headroom) when this sequence does not fit.
    // NOTE(review): the realloc result is not checked before use.
    if (maxSeqLen < (size_t)centerSequence.L) {
        maxSeqLen = sizeof(char) * centerSequence.L * 1.5;
        charSequence = (unsigned char*)realloc(charSequence, maxSeqLen);
    }

    // tantan masks in place, so it works on a copy of the numeric codes
    const size_t byteCount = sizeof(unsigned char) * centerSequence.L;
    memcpy(charSequence, centerSequence.numSequence, byteCount);
    tantan::maskSequences(charSequence, charSequence + centerSequence.L,
                          50 /*options.maxCycleLength*/,
                          probMatrix.probMatrixPointers,
                          0.005 /*options.repeatProb*/,
                          0.05 /*options.repeatEndProb*/,
                          0.9 /*options.repeatOffsetProbDecay*/,
                          0, 0,
                          maskProb /*options.minMaskProb*/,
                          probMatrix.hardMaskTable);

    for (int pos = 0; pos < centerSequence.L; pos++) {
        if (charSequence[pos] != maskLetterNum) {
            continue;
        }
        // masked position: overwrite the whole profile column
        const size_t columnBegin = pos * Sequence::PROFILE_AA_SIZE;
        for (size_t aa = 0; aa < Sequence::PROFILE_AA_SIZE; aa++) {
            pssmRes.pssm[columnBegin + aa] = -1;
        }
    }
}
81+
82+
83+
/**
 * Masks every run of one repeated letter that is longer than maskNrepeating.
 *
 * The sequence is scanned once. A run is a maximal stretch of identical codes;
 * when its length exceeds maskNrepeating, every position of the run is
 * overwritten with maskChar.
 *
 * The run is tracked by its start index instead of a "previous letter"
 * sentinel. The earlier sentinel value '\0' compared equal to residue code 0,
 * so a run of code 0 at the very start of the sequence was counted one short
 * and its start index stayed at -1, which wrapped to UINT_MAX in the masking
 * loop and left the run unmasked.
 *
 * @param numSequence    numeric sequence, modified in place
 * @param seqLen         number of entries in numSequence (0 is allowed)
 * @param maskNrepeating runs strictly longer than this are masked; a negative
 *                       value converts to a huge unsigned threshold and
 *                       therefore masks nothing
 * @param maskChar       code written to masked positions
 * @return number of positions overwritten, including positions that already
 *         held maskChar
 */
int Masker::maskRepeats(unsigned char * numSequence, const unsigned int seqLen, int maskNrepeating, char maskChar) {
    const unsigned int threshold = (unsigned int)maskNrepeating;
    int maskedResidues = 0;
    unsigned int runStart = 0;

    // pos == seqLen acts as the terminator that closes the last run
    for (unsigned int pos = 1; pos <= seqLen; ++pos) {
        const bool runContinues = (pos < seqLen) && (numSequence[pos] == numSequence[runStart]);
        if (runContinues) {
            continue;
        }

        const unsigned int runLength = pos - runStart;
        if (runLength > threshold) {
            for (unsigned int i = runStart; i < pos; ++i) {
                numSequence[i] = maskChar;
                maskedResidues++;
            }
        }
        runStart = pos;
    }

    return maskedResidues;
}
118+
119+
void Masker::finalizeMasking(unsigned char * numSequence, const unsigned int seqLen) {
120+
unsigned char maskChar = probMatrix.hardMaskTable[0];
121+
122+
for (unsigned int i = 0; i < seqLen; i++) {
123+
unsigned char code = numSequence[i];
124+
numSequence[i] = (code == maskChar || code == maskLetterNum) ? maskLetterNum : numSequence[i];
125+
}
126+
}
127+
128+
void Masker::applySoftmasking(unsigned char *charSequence, const unsigned char * num_sequence, unsigned int seqLen) {
129+
for (unsigned int pos = 0; pos < seqLen; pos++) {
130+
// If masked, lowercase (soft) or uppercase (hard) could be applied here if needed.
131+
// For simplicity, we treat maskChar as masked and others as uppercase:
132+
charSequence[pos] = (num_sequence[pos] == maskLetterNum)
133+
? (char)std::tolower(charSequence[pos])
134+
: (char)std::toupper(charSequence[pos]);
135+
}
136+
}

Diff for: src/commons/Masker.h

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
#ifndef MMSEQS_MASKER_H
2+
#define MMSEQS_MASKER_H
3+
4+
#include "Parameters.h"
5+
#include "Sequence.h"
6+
#include "SubstitutionMatrix.h"
7+
#include "tantan.h"
8+
#include "PSSMCalculator.h"
9+
#include <cctype>
10+
11+
class Masker {
12+
public:
13+
Masker(BaseMatrix &subMat);
14+
15+
~Masker();
16+
17+
int maskSequence(Sequence & seq, bool maskTantan, double maskProb,
18+
bool maskLowerCaseLetter, int maskNrepeating);
19+
20+
void maskPssm(Sequence& centerSequence, float maskProb, PSSMCalculator::Profile& pssmRes);
21+
22+
void applySoftmasking(unsigned char *charSequence, const unsigned char * numSequence, unsigned int seqLen);
23+
24+
char maskLetterNum;
25+
26+
private:
27+
int maskRepeats(unsigned char *numSequence, const unsigned int seqLen, int maskNrepeating, char maskChar);
28+
29+
void finalizeMasking(unsigned char * numSequence, const unsigned int seqLen);
30+
31+
BaseMatrix &subMat;
32+
ProbabilityMatrix probMatrix;
33+
34+
unsigned char * charSequence;
35+
size_t maxSeqLen;
36+
};
37+
#endif

Diff for: src/commons/Parameters.cpp

+8-1
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,10 @@ Parameters::Parameters():
4444
PARAM_MAX_SEQ_LEN(PARAM_MAX_SEQ_LEN_ID, "--max-seq-len", "Max sequence length", "Maximum sequence length", typeid(size_t), (void *) &maxSeqLen, "^[0-9]{1}[0-9]*", MMseqsParameter::COMMAND_COMMON | MMseqsParameter::COMMAND_EXPERT),
4545
PARAM_DIAGONAL_SCORING(PARAM_DIAGONAL_SCORING_ID, "--diag-score", "Diagonal scoring", "Use ungapped diagonal scoring during prefilter", typeid(bool), (void *) &diagonalScoring, "", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
4646
PARAM_EXACT_KMER_MATCHING(PARAM_EXACT_KMER_MATCHING_ID, "--exact-kmer-matching", "Exact k-mer matching", "Extract only exact k-mers for matching (range 0-1)", typeid(int), (void *) &exactKmerMatching, "^[0-1]{1}$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
47-
PARAM_MASK_RESIDUES(PARAM_MASK_RESIDUES_ID, "--mask", "Mask residues", "Mask sequences in k-mer stage: 0: w/o low complexity masking, 1: with low complexity masking", typeid(int), (void *) &maskMode, "^[0-1]{1}", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
47+
PARAM_MASK_RESIDUES(PARAM_MASK_RESIDUES_ID, "--mask", "Mask residues", "Mask sequences in prefilter stage with tantan: 0: w/o low complexity masking, 1: with low complexity masking", typeid(int), (void *) &maskMode, "^[0-1]{1}", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
4848
PARAM_MASK_PROBABILTY(PARAM_MASK_PROBABILTY_ID, "--mask-prob", "Mask residues probability", "Mask sequences is probablity is above threshold", typeid(float), (void *) &maskProb, "^0(\\.[0-9]+)?|^1(\\.0+)?$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
4949
PARAM_MASK_LOWER_CASE(PARAM_MASK_LOWER_CASE_ID, "--mask-lower-case", "Mask lower case residues", "Lowercase letters will be excluded from k-mer search 0: include region, 1: exclude region", typeid(int), (void *) &maskLowerCaseMode, "^[0-1]{1}", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
50+
PARAM_MASK_N_REPEAT(PARAM_MASK_N_REPEAT_ID, "--mask-n-repeat", "Mask lower letter repeating N times", "Repeat letters that occure > threshold in a rwo", typeid(int), (void *) &maskNrepeats, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
5051
PARAM_MIN_DIAG_SCORE(PARAM_MIN_DIAG_SCORE_ID, "--min-ungapped-score", "Minimum diagonal score", "Accept only matches with ungapped alignment score above threshold", typeid(int), (void *) &minDiagScoreThr, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
5152
PARAM_K_SCORE(PARAM_K_SCORE_ID, "--k-score", "k-score", "k-mer threshold for generating similar k-mer lists", typeid(MultiParam<SeqProf<int>>), (void *) &kmerScore, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
5253
PARAM_MAX_SEQS(PARAM_MAX_SEQS_ID, "--max-seqs", "Max results per query", "Maximum results per query sequence allowed to pass the prefilter (affects sensitivity)", typeid(size_t), (void *) &maxResListLen, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PREFILTER),
@@ -427,6 +428,7 @@ Parameters::Parameters():
427428
prefilter.push_back(&PARAM_MASK_RESIDUES);
428429
prefilter.push_back(&PARAM_MASK_PROBABILTY);
429430
prefilter.push_back(&PARAM_MASK_LOWER_CASE);
431+
prefilter.push_back(&PARAM_MASK_N_REPEAT);
430432
prefilter.push_back(&PARAM_MIN_DIAG_SCORE);
431433
prefilter.push_back(&PARAM_TAXON_LIST);
432434
prefilter.push_back(&PARAM_INCLUDE_IDENTITY);
@@ -788,6 +790,7 @@ Parameters::Parameters():
788790
indexdb.push_back(&PARAM_MASK_RESIDUES);
789791
indexdb.push_back(&PARAM_MASK_PROBABILTY);
790792
indexdb.push_back(&PARAM_MASK_LOWER_CASE);
793+
indexdb.push_back(&PARAM_MASK_N_REPEAT);
791794
indexdb.push_back(&PARAM_SPACED_KMER_MODE);
792795
indexdb.push_back(&PARAM_SPACED_KMER_PATTERN);
793796
indexdb.push_back(&PARAM_S);
@@ -815,6 +818,7 @@ Parameters::Parameters():
815818
kmerindexdb.push_back(&PARAM_MASK_RESIDUES);
816819
kmerindexdb.push_back(&PARAM_MASK_PROBABILTY);
817820
kmerindexdb.push_back(&PARAM_MASK_LOWER_CASE);
821+
kmerindexdb.push_back(&PARAM_MASK_N_REPEAT);
818822
kmerindexdb.push_back(&PARAM_CHECK_COMPATIBLE);
819823
kmerindexdb.push_back(&PARAM_SEARCH_TYPE);
820824
kmerindexdb.push_back(&PARAM_SPACED_KMER_MODE);
@@ -992,6 +996,7 @@ Parameters::Parameters():
992996
kmermatcher.push_back(&PARAM_MASK_RESIDUES);
993997
kmermatcher.push_back(&PARAM_MASK_PROBABILTY);
994998
kmermatcher.push_back(&PARAM_MASK_LOWER_CASE);
999+
kmermatcher.push_back(&PARAM_MASK_N_REPEAT);
9951000
kmermatcher.push_back(&PARAM_COV_MODE);
9961001
kmermatcher.push_back(&PARAM_K);
9971002
kmermatcher.push_back(&PARAM_C);
@@ -1013,6 +1018,7 @@ Parameters::Parameters():
10131018
kmersearch.push_back(&PARAM_MASK_RESIDUES);
10141019
kmersearch.push_back(&PARAM_MASK_PROBABILTY);
10151020
kmersearch.push_back(&PARAM_MASK_LOWER_CASE);
1021+
kmersearch.push_back(&PARAM_MASK_N_REPEAT);
10161022
kmersearch.push_back(&PARAM_COV_MODE);
10171023
kmersearch.push_back(&PARAM_C);
10181024
kmersearch.push_back(&PARAM_MAX_SEQ_LEN);
@@ -2350,6 +2356,7 @@ void Parameters::setDefaults() {
23502356
maskMode = 1;
23512357
maskProb = 0.9;
23522358
maskLowerCaseMode = 0;
2359+
maskNrepeats = 0;
23532360
minDiagScoreThr = 15;
23542361
spacedKmer = true;
23552362
includeIdentity = false;

Diff for: src/commons/Parameters.h

+2
Original file line numberDiff line numberDiff line change
@@ -413,6 +413,7 @@ class Parameters {
413413
int maskMode; // mask low complex areas
414414
float maskProb; // mask probability
415415
int maskLowerCaseMode; // mask lowercase letters in prefilter and kmermatchers
416+
int maskNrepeats; // mask letters that occur more than N times in a row
416417

417418
int minDiagScoreThr; // min diagonal score
418419
int spacedKmer; // Spaced Kmers
@@ -754,6 +755,7 @@ class Parameters {
754755
PARAMETER(PARAM_MASK_RESIDUES)
755756
PARAMETER(PARAM_MASK_PROBABILTY)
756757
PARAMETER(PARAM_MASK_LOWER_CASE)
758+
PARAMETER(PARAM_MASK_N_REPEAT)
757759

758760
PARAMETER(PARAM_MIN_DIAG_SCORE)
759761
PARAMETER(PARAM_K_SCORE)

0 commit comments

Comments
 (0)