Skip to content

Commit 93ef371

Browse files
committed
error fix and update regression results
1 parent 648c635 commit 93ef371

8 files changed

+92
-20
lines changed

Diff for: regression/run_cluster.sh

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
#!/bin/sh -e
# Regression check for the `cluster` workflow.
# Reads MMSEQS (binary to run), DATADIR (input data) and RESULTS (output
# directory) from the environment; none of them is set in this script.
# `-e` makes the shell abort on the first failing command.
# Writes a GOOD/BAD verdict to "${RESULTS}.report".
2+
3+
CLUDB="${RESULTS}/clu"
# Build the sequence database from clu.fasta (passing --shuffle 0).
4+
"${MMSEQS}" createdb "${DATADIR}/clu.fasta" "${CLUDB}" --shuffle 0
5+
# Cluster the database, then export the clustering as a TSV file.
6+
"${MMSEQS}" cluster "${CLUDB}" "$RESULTS/results_clu" "$RESULTS/tmp" --min-seq-id 0.3 -s 2 --cluster-steps 3
7+
"${MMSEQS}" createtsv "${CLUDB}" "${CLUDB}" "$RESULTS/results_clu" "$RESULTS/results_cluster.tsv"
8+
# Summarise the TSV as three tab-separated numbers: cnt = number of runs of
# identical first-column values, t = number of rows, and t/cnt.
# NOTE(review): an empty TSV leaves cnt at 0, so t/cnt divides by zero.
9+
awk 'BEGIN { l = "" } l != $1 { l = $1; cnt++; } { t++; } END { print cnt"\t"t"\t"(t/cnt) }' "$RESULTS/results_cluster.tsv" > "$RESULTS/results_summary.tsv"
# ACTUAL is the first field of the summary (cnt); TARGET is the expected value.
10+
ACTUAL="$(cut -f1 "$RESULTS/results_summary.tsv")"
11+
TARGET="15691"
# Report GOOD when ACTUAL equals TARGET, BAD otherwise, followed by both values.
12+
awk -v actual="$ACTUAL" -v target="$TARGET" \
13+
'BEGIN { print (actual == target) ? "GOOD" : "BAD"; print "Expected: ", target; print "Actual: ", actual; }' \
14+
> "${RESULTS}.report"

Diff for: regression/run_cluster_update.sh

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#!/bin/sh -e
# Regression check for the `clusterupdate` workflow.
# Reads MMSEQS, DATADIR and RESULTS from the environment; none of them is set
# in this script. `-e` makes the shell abort on the first failing command.
# Writes a GOOD/BAD verdict to "${RESULTS}.report".
2+
SEQCLUDB1="${RESULTS}/clu1"
3+
SEQCLUDB2="${RESULTS}/clu2"
# Split clu.fasta by line position: lines 1-2 of every group of four go to
# clu1.fasta, lines 3-4 go to clu2.fasta.
# NOTE(review): presumably each FASTA record is exactly two lines (header +
# sequence), so records alternate between the two files — confirm.
4+
awk 'NR%4==1 || NR%4==2{print}' "${DATADIR}/clu.fasta" > "$RESULTS/clu1.fasta"
5+
awk 'NR%4==3 || NR%4==0{print}' "${DATADIR}/clu.fasta" > "$RESULTS/clu2.fasta"
# Append the first two lines of clu1.fasta to clu2.fasta, then concatenate
# both files; those two lines therefore appear twice in cluCombined.fasta.
6+
head -n 2 "$RESULTS/clu1.fasta" >> "$RESULTS/clu2.fasta"
7+
cat "$RESULTS/clu1.fasta" "$RESULTS/clu2.fasta" > "$RESULTS/cluCombined.fasta"
8+
# Databases: SEQCLUDB1 holds the first half, SEQCLUDB2 holds the combined set.
9+
"${MMSEQS}" createdb "$RESULTS/clu1.fasta" "${SEQCLUDB1}"
10+
"${MMSEQS}" createdb "$RESULTS/cluCombined.fasta" "${SEQCLUDB2}"
11+
# Cluster the first half, update that clustering against the combined
# database, and export the updated clustering as a TSV file.
12+
"${MMSEQS}" linclust "${SEQCLUDB1}" "$RESULTS/results_clu" "$RESULTS/tmp" --cov-mode 1 -a -c 0.50 --min-seq-id 0.50
13+
"${MMSEQS}" clusterupdate "${SEQCLUDB1}" "${SEQCLUDB2}" "$RESULTS/results_clu" "$RESULTS/seqdb_update" "$RESULTS/clu_updated" "$RESULTS/tmp" --cov-mode 1 -c 0.50 --min-seq-id 0.50
14+
"${MMSEQS}" createtsv "$RESULTS/seqdb_update" "$RESULTS/seqdb_update" "$RESULTS/clu_updated" "$RESULTS/clu_updated.tsv"
15+
# Three measured values:
#   CLUSTERMEMEBER - number of rows in the updated TSV
#   CLUSTER        - number of distinct first-column values in the TSV
#   UPDATEDSEQCNT  - number of lines in seqdb_update.index
# NOTE(review): "CLUSTERMEMEBER" looks like a misspelling of CLUSTERMEMBER;
# it is used consistently below, so behavior is unaffected.
16+
CLUSTERMEMEBER=$(wc -l "$RESULTS/clu_updated.tsv" | awk '{print $1}')
# The unquoted $(...) inside echo is word-split, presumably to drop any
# padding whitespace that `wc -l` prints.
17+
CLUSTER=$(echo $(cut -f1 "$RESULTS/clu_updated.tsv" | sort -u | wc -l))
18+
UPDATEDSEQCNT=$(wc -l "$RESULTS/seqdb_update.index" | awk '{print $1}')
19+
# TARGET and ACTUAL are space-separated triples in the same order:
# rows, distinct first-column values, index lines.
20+
TARGET="32132 24733 32132"
21+
ACTUAL="$CLUSTERMEMEBER $CLUSTER $UPDATEDSEQCNT"
# Report GOOD when the triples are equal, BAD otherwise, followed by both.
22+
awk -v actual="$ACTUAL" -v target="$TARGET" 'BEGIN { print (actual == target) ? "GOOD" : "BAD"; \
23+
print "Expected: ", target; \
24+
print "Actual: ", actual; }' > "${RESULTS}.report"

Diff for: regression/run_easy_cluster.sh

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
#!/bin/sh -e
# Regression check for the `easy-cluster` workflow run directly on clu.fasta.
# Reads MMSEQS, DATADIR and RESULTS from the environment; none of them is set
# in this script. `-e` makes the shell abort on the first failing command.
# Writes a GOOD/BAD verdict to "${RESULTS}.report".
2+
"${MMSEQS}" easy-cluster "${DATADIR}/clu.fasta" "$RESULTS/results" "$RESULTS/tmp" --min-seq-id 0.3 -s 2 --cluster-steps 3
3+
# Summarise results_cluster.tsv (presumably written by easy-cluster for the
# "$RESULTS/results" prefix — confirm) as: cnt = runs of identical
# first-column values, t = rows, and t/cnt.
# NOTE(review): an empty TSV leaves cnt at 0, so t/cnt divides by zero.
4+
awk 'BEGIN { l = "" } l != $1 { l = $1; cnt++; } { t++; } END { print cnt"\t"t"\t"(t/cnt) }' "$RESULTS/results_cluster.tsv" > "$RESULTS/results_summary.tsv"
# ACTUAL is the first field of the summary (cnt); TARGET is the expected value.
5+
ACTUAL="$(cut -f1 "$RESULTS/results_summary.tsv")"
6+
TARGET="15691"
# Report GOOD when ACTUAL equals TARGET, BAD otherwise, followed by both values.
7+
awk -v actual="$ACTUAL" -v target="$TARGET" \
8+
'BEGIN { print (actual == target) ? "GOOD" : "BAD"; print "Expected: ", target; print "Actual: ", actual; }' \
9+
> "${RESULTS}.report"

Diff for: regression/run_easy_cluster_reassign.sh

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
#!/bin/sh -e
# Regression check for `easy-cluster` with --cluster-reassign 1, feeding the
# FASTA input through a pipe (the input argument is the literal word "stdin").
# Reads MMSEQS, DATADIR and RESULTS from the environment; none of them is set
# in this script. `-e` makes the shell abort on the first failing command.
# Writes a GOOD/BAD verdict to "${RESULTS}.report".
2+
3+
cat "${DATADIR}/clu.fasta" | "${MMSEQS}" easy-cluster stdin "$RESULTS/results" "$RESULTS/tmp" --min-seq-id 0.3 -s 2 -c 0.8 --cov-mode 1 --cluster-reassign 1
4+
# Summarise results_cluster.tsv (presumably written by easy-cluster for the
# "$RESULTS/results" prefix — confirm) as: cnt = runs of identical
# first-column values, t = rows, and t/cnt.
# NOTE(review): an empty TSV leaves cnt at 0, so t/cnt divides by zero.
5+
awk 'BEGIN { l = "" } l != $1 { l = $1; cnt++; } { t++; } END { print cnt"\t"t"\t"(t/cnt) }' "$RESULTS/results_cluster.tsv" > "$RESULTS/results_summary.tsv"
# ACTUAL is the first field of the summary (cnt); TARGET is the expected value.
6+
ACTUAL="$(cut -f1 "$RESULTS/results_summary.tsv")"
7+
TARGET="17231"
# Report GOOD when ACTUAL equals TARGET, BAD otherwise, followed by both values.
8+
awk -v actual="$ACTUAL" -v target="$TARGET" \
9+
'BEGIN { print (actual == target) ? "GOOD" : "BAD"; print "Expected: ", target; print "Actual: ", actual; }' \
10+
> "${RESULTS}.report"

Diff for: regression/run_easy_linclust.sh

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
#!/bin/sh -e
# Regression check for the `easy-linclust` workflow run directly on clu.fasta.
# Reads MMSEQS, DATADIR and RESULTS from the environment; none of them is set
# in this script. `-e` makes the shell abort on the first failing command.
# Writes a GOOD/BAD verdict to "${RESULTS}.report".
2+
"${MMSEQS}" easy-linclust "${DATADIR}/clu.fasta" "$RESULTS/results" "$RESULTS/tmp" --cov-mode 1 --cluster-mode 0 -c 0.90 --min-seq-id 0.50
3+
# Summarise results_cluster.tsv (presumably written by easy-linclust for the
# "$RESULTS/results" prefix — confirm) as: cnt = runs of identical
# first-column values, t = rows, and t/cnt.
# NOTE(review): an empty TSV leaves cnt at 0, so t/cnt divides by zero.
4+
awk 'BEGIN { l = "" } l != $1 { l = $1; cnt++; } { t++; } END { print cnt"\t"t"\t"(t/cnt) }' "$RESULTS/results_cluster.tsv" > "$RESULTS/results_summary.tsv"
# ACTUAL is the first field of the summary (cnt); TARGET is the expected value.
5+
ACTUAL="$(cut -f1 "$RESULTS/results_summary.tsv")"
6+
TARGET="26146"
# Report GOOD when ACTUAL equals TARGET, BAD otherwise, followed by both values.
7+
awk -v actual="$ACTUAL" -v target="$TARGET" \
8+
'BEGIN { print (actual == target) ? "GOOD" : "BAD"; print "Expected: ", target; print "Actual: ", actual; }' \
9+
> "${RESULTS}.report"

Diff for: regression/run_linclust.sh

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#!/bin/sh -e
# Regression check for the `linclust` workflow.
# Reads MMSEQS, DATADIR and RESULTS from the environment; none of them is set
# in this script. `-e` makes the shell abort on the first failing command.
# Writes a GOOD/BAD verdict to "${RESULTS}.report".
# NOTE(review): CLUDB is assigned an empty value and never referenced again;
# every command below spells out "${RESULTS}/clu" instead.
2+
CLUDB=
# Build the sequence database from clu.fasta.
3+
"${MMSEQS}" createdb "${DATADIR}/clu.fasta" "${RESULTS}/clu"
4+
# Cluster the database, then export the clustering as a TSV file.
5+
"${MMSEQS}" linclust "${RESULTS}/clu" "$RESULTS/results_clu" "$RESULTS/tmp" --cov-mode 1 --cluster-mode 0 -c 0.90 --min-seq-id 0.50
6+
"${MMSEQS}" createtsv "${RESULTS}/clu" "${RESULTS}/clu" "$RESULTS/results_clu" "$RESULTS/results_cluster.tsv"
7+
# Summarise the TSV as three tab-separated numbers: cnt = number of runs of
# identical first-column values, t = number of rows, and t/cnt.
# NOTE(review): an empty TSV leaves cnt at 0, so t/cnt divides by zero.
8+
awk 'BEGIN { l = "" } l != $1 { l = $1; cnt++; } { t++; } END { print cnt"\t"t"\t"(t/cnt) }' "$RESULTS/results_cluster.tsv" > "$RESULTS/results_summary.tsv"
# ACTUAL is the first field of the summary (cnt); TARGET is the expected value.
9+
ACTUAL="$(cut -f1 "$RESULTS/results_summary.tsv")"
10+
TARGET="26135"
# Report GOOD when ACTUAL equals TARGET, BAD otherwise, followed by both values.
11+
awk -v actual="$ACTUAL" -v target="$TARGET" \
12+
'BEGIN { print (actual == target) ? "GOOD" : "BAD"; print "Expected: ", target; print "Actual: ", actual; }' \
13+
> "${RESULTS}.report"

Diff for: regression/run_linclust_split.sh

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#!/bin/sh -e
# Regression check for `linclust` run with --split-memory-limit 10M.
# Uses the same other linclust options and the same TARGET (26135) as
# run_linclust.sh.
# Reads MMSEQS, DATADIR and RESULTS from the environment; none of them is set
# in this script. `-e` makes the shell abort on the first failing command.
# Writes a GOOD/BAD verdict to "${RESULTS}.report".
2+
"${MMSEQS}" createdb "${DATADIR}/clu.fasta" "${RESULTS}/clu"
3+
# Cluster the database under the memory limit, then export the clustering
# as a TSV file.
4+
"${MMSEQS}" linclust "${RESULTS}/clu" "$RESULTS/results_clu" "$RESULTS/tmp" --cov-mode 1 --cluster-mode 0 -c 0.90 --min-seq-id 0.50 --split-memory-limit 10M
5+
"${MMSEQS}" createtsv "${RESULTS}/clu" "${RESULTS}/clu" "$RESULTS/results_clu" "$RESULTS/results_cluster.tsv"
6+
# Summarise the TSV as three tab-separated numbers: cnt = number of runs of
# identical first-column values, t = number of rows, and t/cnt.
# NOTE(review): an empty TSV leaves cnt at 0, so t/cnt divides by zero.
7+
awk 'BEGIN { l = "" } l != $1 { l = $1; cnt++; } { t++; } END { print cnt"\t"t"\t"(t/cnt) }' "$RESULTS/results_cluster.tsv" > "$RESULTS/results_summary.tsv"
# ACTUAL is the first field of the summary (cnt); TARGET is the expected value.
8+
ACTUAL="$(cut -f1 "$RESULTS/results_summary.tsv")"
9+
TARGET="26135"
# Report GOOD when ACTUAL equals TARGET, BAD otherwise, followed by both values.
10+
awk -v actual="$ACTUAL" -v target="$TARGET" \
11+
'BEGIN { print (actual == target) ? "GOOD" : "BAD"; print "Expected: ", target; print "Actual: ", actual; }' \
12+
> "${RESULTS}.report"

Diff for: src/linclust/kmermatcher.cpp

+1-20
Original file line numberDiff line numberDiff line change
@@ -54,26 +54,6 @@ KmerPosition<T, IncludeAdjacentSeq> *initKmerPositionMemory(size_t size) {
5454
return hashSeqPair;
5555
}
5656

// maskSequence: masks positions of seq.numSequence in place, in up to two passes.
// (Every line of this function carries a "-" diff marker, i.e. the commit
// shown here removes it from this file.)
//
//   maskMode      - when 1, tantan::maskSequences is run over
//                   seq.numSequence[0 .. seq.L) with the fixed settings below.
//   maskLowerCase - when 1, and the sequence type is amino acid or nucleotide,
//                   every position whose character in seq.getSeqData() is
//                   lowercase is overwritten with maskLetter.
//   maskProb      - passed to tantan as the minimum mask probability.
//   seq           - sequence whose numSequence buffer is modified.
//   maskLetter    - value written at lowercase positions.
//   probMatrix    - supplies probMatrixPointers and hardMaskTable for tantan.
//
// NOTE(review): the lowercase pass indexes getSeqData() up to seq.L, so it
// assumes that buffer has at least seq.L characters — confirm.
57-
void maskSequence(int maskMode, int maskLowerCase, float maskProb, Sequence &seq, int maskLetter, ProbabilityMatrix * probMatrix){
58-
if (maskMode == 1) {
59-
tantan::maskSequences((char*)seq.numSequence,
60-
(char*)(seq.numSequence + seq.L),
61-
50 /*options.maxCycleLength*/,
62-
probMatrix->probMatrixPointers,
63-
0.005 /*options.repeatProb*/,
64-
0.05 /*options.repeatEndProb*/,
65-
0.5 /*options.repeatOffsetProbDecay*/,
66-
0, 0,
67-
maskProb /*options.minMaskProb*/, probMatrix->hardMaskTable);
68-
}
// Second pass: lowercase masking, limited to amino-acid and nucleotide types.
69-
if(maskLowerCase == 1 && (Parameters::isEqualDbtype(seq.getSequenceType(), Parameters::DBTYPE_AMINO_ACIDS) ||
70-
Parameters::isEqualDbtype(seq.getSequenceType(), Parameters::DBTYPE_NUCLEOTIDES))) {
71-
const char * charSeq = seq.getSeqData();
72-
for (int i = 0; i < seq.L; i++) {
73-
seq.numSequence[i] = (islower(charSeq[i])) ? maskLetter : seq.numSequence[i];
74-
}
75-
}
76-
}
7757

7858
template <int TYPE, typename T, bool IncludeAdjacentSeq>
7959
std::pair<size_t, size_t> fillKmerPositionArray(KmerPosition<T, IncludeAdjacentSeq> * kmerArray, size_t kmerArraySize, DBReader<unsigned int> &seqDbr, Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution){
@@ -1039,6 +1019,7 @@ int kmermatcherInner(Parameters& par, DBReader<unsigned int>& seqDbr) {
10391019
}
10401020
MPI_Barrier(MPI_COMM_WORLD);
10411021
if(mpiRank == 0){
1022+
std::string splitBufferName;
10421023
for(size_t split = 0; split < splits; split++) {
10431024
std::string splitFileName = par.db2 + "_split_" +SSTR(split);
10441025
splitFiles.push_back(splitFileName);

0 commit comments

Comments
 (0)