error fix and update regression results

ChunShow · ChunShow · commit 93ef3710c3be · 2025-01-22T12:00:55.000+09:00
diff --git a/regression/run_cluster.sh b/regression/run_cluster.sh
@@ -0,0 +1,14 @@
+#!/bin/sh -e
+
+CLUDB="${RESULTS}/clu"
+"${MMSEQS}" createdb "${DATADIR}/clu.fasta" "${CLUDB}" --shuffle 0
+
+"${MMSEQS}" cluster "${CLUDB}" "$RESULTS/results_clu" "$RESULTS/tmp" --min-seq-id 0.3 -s 2 --cluster-steps 3
+"${MMSEQS}" createtsv "${CLUDB}" "${CLUDB}" "$RESULTS/results_clu" "$RESULTS/results_cluster.tsv"
+
+awk 'BEGIN { l = "" } l != $1 { l = $1; cnt++; } { t++; } END { print cnt"\t"t"\t"(t/cnt) }' "$RESULTS/results_cluster.tsv" > "$RESULTS/results_summary.tsv"
+ACTUAL="$(cut -f1 "$RESULTS/results_summary.tsv")"
+TARGET="15691"
+awk -v actual="$ACTUAL" -v target="$TARGET" \
+    'BEGIN { print (actual == target) ? "GOOD" : "BAD"; print "Expected: ", target; print "Actual: ", actual; }' \
+    > "${RESULTS}.report"
diff --git a/regression/run_cluster_update.sh b/regression/run_cluster_update.sh
@@ -0,0 +1,24 @@
+#!/bin/sh -e
+SEQCLUDB1="${RESULTS}/clu1"
+SEQCLUDB2="${RESULTS}/clu2"
+awk 'NR%4==1 || NR%4==2{print}' "${DATADIR}/clu.fasta" > "$RESULTS/clu1.fasta"
+awk 'NR%4==3 || NR%4==0{print}' "${DATADIR}/clu.fasta" > "$RESULTS/clu2.fasta"
+head -n 2 "$RESULTS/clu1.fasta" >> "$RESULTS/clu2.fasta"
+cat "$RESULTS/clu1.fasta" "$RESULTS/clu2.fasta" > "$RESULTS/cluCombined.fasta"
+
+"${MMSEQS}" createdb "$RESULTS/clu1.fasta" "${SEQCLUDB1}"
+"${MMSEQS}" createdb "$RESULTS/cluCombined.fasta" "${SEQCLUDB2}"
+
+"${MMSEQS}" linclust "${SEQCLUDB1}" "$RESULTS/results_clu" "$RESULTS/tmp" --cov-mode 1 -a -c 0.50 --min-seq-id 0.50
+"${MMSEQS}" clusterupdate "${SEQCLUDB1}" "${SEQCLUDB2}" "$RESULTS/results_clu" "$RESULTS/seqdb_update" "$RESULTS/clu_updated" "$RESULTS/tmp" --cov-mode 1 -c 0.50 --min-seq-id 0.50
+"${MMSEQS}" createtsv "$RESULTS/seqdb_update" "$RESULTS/seqdb_update" "$RESULTS/clu_updated" "$RESULTS/clu_updated.tsv"
+
+CLUSTERMEMEBER=$(wc -l "$RESULTS/clu_updated.tsv" | awk '{print $1}')
+CLUSTER=$(echo $(cut -f1 "$RESULTS/clu_updated.tsv" | sort -u | wc -l))
+UPDATEDSEQCNT=$(wc -l "$RESULTS/seqdb_update.index" | awk '{print $1}')
+
+TARGET="32132 24733 32132"
+ACTUAL="$CLUSTERMEMEBER $CLUSTER $UPDATEDSEQCNT"
+awk -v actual="$ACTUAL" -v target="$TARGET" 'BEGIN { print (actual == target) ? "GOOD" : "BAD"; \
+    print "Expected: ", target; \
+    print "Actual:   ", actual; }' > "${RESULTS}.report"
diff --git a/regression/run_easy_cluster.sh b/regression/run_easy_cluster.sh
@@ -0,0 +1,9 @@
+#!/bin/sh -e
+"${MMSEQS}" easy-cluster "${DATADIR}/clu.fasta" "$RESULTS/results" "$RESULTS/tmp" --min-seq-id 0.3 -s 2 --cluster-steps 3
+
+awk 'BEGIN { l = "" } l != $1 { l = $1; cnt++; } { t++; } END { print cnt"\t"t"\t"(t/cnt) }' "$RESULTS/results_cluster.tsv" > "$RESULTS/results_summary.tsv"
+ACTUAL="$(cut -f1 "$RESULTS/results_summary.tsv")"
+TARGET="15691"
+awk -v actual="$ACTUAL" -v target="$TARGET" \
+    'BEGIN { print (actual == target) ? "GOOD" : "BAD"; print "Expected: ", target; print "Actual: ", actual; }' \
+    > "${RESULTS}.report"
diff --git a/regression/run_easy_cluster_reassign.sh b/regression/run_easy_cluster_reassign.sh
@@ -0,0 +1,10 @@
+#!/bin/sh -e
+
+cat  "${DATADIR}/clu.fasta" | "${MMSEQS}" easy-cluster stdin "$RESULTS/results" "$RESULTS/tmp" --min-seq-id 0.3 -s 2 -c 0.8 --cov-mode 1 --cluster-reassign 1
+
+awk 'BEGIN { l = "" } l != $1 { l = $1; cnt++; } { t++; } END { print cnt"\t"t"\t"(t/cnt) }' "$RESULTS/results_cluster.tsv" > "$RESULTS/results_summary.tsv"
+ACTUAL="$(cut -f1 "$RESULTS/results_summary.tsv")"
+TARGET="17231"
+awk -v actual="$ACTUAL" -v target="$TARGET" \
+    'BEGIN { print (actual == target) ? "GOOD" : "BAD"; print "Expected: ", target; print "Actual: ", actual; }' \
+    > "${RESULTS}.report"
diff --git a/regression/run_easy_linclust.sh b/regression/run_easy_linclust.sh
@@ -0,0 +1,9 @@
+#!/bin/sh -e
+"${MMSEQS}" easy-linclust "${DATADIR}/clu.fasta" "$RESULTS/results" "$RESULTS/tmp" --cov-mode 1 --cluster-mode 0 -c 0.90 --min-seq-id 0.50
+
+awk 'BEGIN { l = "" } l != $1 { l = $1; cnt++; } { t++; } END { print cnt"\t"t"\t"(t/cnt) }' "$RESULTS/results_cluster.tsv" > "$RESULTS/results_summary.tsv"
+ACTUAL="$(cut -f1 "$RESULTS/results_summary.tsv")"
+TARGET="26146"
+awk -v actual="$ACTUAL" -v target="$TARGET" \
+    'BEGIN { print (actual == target) ? "GOOD" : "BAD"; print "Expected: ", target; print "Actual: ", actual; }' \
+    > "${RESULTS}.report"
diff --git a/regression/run_linclust.sh b/regression/run_linclust.sh
@@ -0,0 +1,13 @@
+#!/bin/sh -e
+CLUDB=
+"${MMSEQS}" createdb "${DATADIR}/clu.fasta" "${RESULTS}/clu"
+
+"${MMSEQS}" linclust "${RESULTS}/clu" "$RESULTS/results_clu" "$RESULTS/tmp" --cov-mode 1 --cluster-mode 0 -c 0.90 --min-seq-id 0.50
+"${MMSEQS}" createtsv "${RESULTS}/clu" "${RESULTS}/clu" "$RESULTS/results_clu" "$RESULTS/results_cluster.tsv"
+
+awk 'BEGIN { l = "" } l != $1 { l = $1; cnt++; } { t++; } END { print cnt"\t"t"\t"(t/cnt) }' "$RESULTS/results_cluster.tsv" > "$RESULTS/results_summary.tsv"
+ACTUAL="$(cut -f1 "$RESULTS/results_summary.tsv")"
+TARGET="26135"
+awk -v actual="$ACTUAL" -v target="$TARGET" \
+    'BEGIN { print (actual == target) ? "GOOD" : "BAD"; print "Expected: ", target; print "Actual: ", actual; }' \
+    > "${RESULTS}.report"
diff --git a/regression/run_linclust_split.sh b/regression/run_linclust_split.sh
@@ -0,0 +1,12 @@
+#!/bin/sh -e
+"${MMSEQS}" createdb "${DATADIR}/clu.fasta" "${RESULTS}/clu"
+
+"${MMSEQS}" linclust "${RESULTS}/clu" "$RESULTS/results_clu" "$RESULTS/tmp" --cov-mode 1 --cluster-mode 0 -c 0.90 --min-seq-id 0.50 --split-memory-limit 10M
+"${MMSEQS}" createtsv "${RESULTS}/clu" "${RESULTS}/clu" "$RESULTS/results_clu" "$RESULTS/results_cluster.tsv"
+
+awk 'BEGIN { l = "" } l != $1 { l = $1; cnt++; } { t++; } END { print cnt"\t"t"\t"(t/cnt) }' "$RESULTS/results_cluster.tsv" > "$RESULTS/results_summary.tsv"
+ACTUAL="$(cut -f1 "$RESULTS/results_summary.tsv")"
+TARGET="26135"
+awk -v actual="$ACTUAL" -v target="$TARGET" \
+    'BEGIN { print (actual == target) ? "GOOD" : "BAD"; print "Expected: ", target; print "Actual: ", actual; }' \
+    > "${RESULTS}.report"
diff --git a/src/linclust/kmermatcher.cpp b/src/linclust/kmermatcher.cpp
@@ -54,26 +54,6 @@ KmerPosition<T, IncludeAdjacentSeq> *initKmerPositionMemory(size_t size) {
     return hashSeqPair;
 }
 
-void maskSequence(int maskMode, int maskLowerCase, float maskProb, Sequence &seq, int maskLetter, ProbabilityMatrix * probMatrix){
-    if (maskMode == 1) {
-        tantan::maskSequences((char*)seq.numSequence,
-                              (char*)(seq.numSequence + seq.L),
-                              50 /*options.maxCycleLength*/,
-                              probMatrix->probMatrixPointers,
-                              0.005 /*options.repeatProb*/,
-                              0.05 /*options.repeatEndProb*/,
-                              0.5 /*options.repeatOffsetProbDecay*/,
-                              0, 0,
-                              maskProb /*options.minMaskProb*/, probMatrix->hardMaskTable);
-    }
-    if(maskLowerCase == 1 && (Parameters::isEqualDbtype(seq.getSequenceType(), Parameters::DBTYPE_AMINO_ACIDS) ||
-                              Parameters::isEqualDbtype(seq.getSequenceType(), Parameters::DBTYPE_NUCLEOTIDES))) {
-        const char * charSeq = seq.getSeqData();
-        for (int i = 0; i < seq.L; i++) {
-            seq.numSequence[i] = (islower(charSeq[i])) ? maskLetter : seq.numSequence[i];
-        }
-    }
-}
 
 template <int TYPE, typename T, bool IncludeAdjacentSeq>
 std::pair<size_t, size_t> fillKmerPositionArray(KmerPosition<T, IncludeAdjacentSeq> * kmerArray, size_t kmerArraySize, DBReader<unsigned int> &seqDbr, Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution){
@@ -1039,6 +1019,7 @@ int kmermatcherInner(Parameters& par, DBReader<unsigned int>& seqDbr) {
     }
     MPI_Barrier(MPI_COMM_WORLD);
     if(mpiRank == 0){
+        std::string splitBufferName;
         for(size_t split = 0; split < splits; split++) {
             std::string splitFileName = par.db2 + "_split_" +SSTR(split);
             splitFiles.push_back(splitFileName);