Skip to content

Commit

Permalink
Merge commit '9d4204bf5ccb964f2e37e1a2c6b9f48ef9a70cd0'
Browse files Browse the repository at this point in the history
  • Loading branch information
gamcil committed Mar 4, 2024
2 parents 77dc74a + 9d4204b commit 57a69be
Show file tree
Hide file tree
Showing 41 changed files with 1,439 additions and 523 deletions.
87 changes: 40 additions & 47 deletions lib/foldseek/README.md

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions lib/foldseek/data/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ set(COMPILED_RESOURCES
evalue_nn.kerasify
main.js
vendor.js.zst
complexsearch.sh
easycomplexsearch.sh
)

Expand Down
46 changes: 46 additions & 0 deletions lib/foldseek/data/complexsearch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/bin/sh -e
# Complex search workflow.
#
# Steps:
#   1. search         QUERYDB vs. TARGETDB            -> ${TMP_PATH}/result
#   2. (only when PREFMODE != EXHAUSTIVE)
#      expandcomplex  result                          -> ${TMP_PATH}/result_expand_pref
#      ${COMPLEX_ALIGNMENT_ALGO} result_expand_pref   -> ${TMP_PATH}/result_expand_aligned
#   3. scorecomplex   latest result                   -> ${OUTPUT}
#   4. optional cleanup of intermediates when REMOVE_TMP is non-empty.
#
# Variables read from the environment:
#   MMSEQS, QUERYDB, TARGETDB, OUTPUT, TMP_PATH, PREFMODE, COMPLEX_ALIGNMENT_ALGO,
#   SEARCH_PAR, THREADS_PAR, COMPLEX_ALIGN_PAR, SCORECOMPLEX_PAR, REMOVE_TMP, VERBOSITY.
# The *_PAR and VERBOSITY values are expanded unquoted on purpose so that they
# split into individual command-line arguments (hence the SC2086 suppressions).

# Print "Error: <message>" and abort the workflow with exit status 1.
# Only the first argument is printed, so callers must pass a single string.
fail() {
    echo "Error: $1"
    exit 1
}

# Succeeds when "$1" is not an existing regular file.
notExists() {
    [ ! -f "$1" ]
}

# Step 1: initial search; skipped when a previous run already produced it.
if notExists "${TMP_PATH}/result.dbtype"; then
    # shellcheck disable=SC2086
    "$MMSEQS" search "${QUERYDB}" "${TARGETDB}" "${TMP_PATH}/result" "${TMP_PATH}/search_tmp" ${SEARCH_PAR} \
        || fail "Search died"
fi

# RESULT tracks the database that is fed into scorecomplex.
RESULT="${TMP_PATH}/result"
if [ "$PREFMODE" != "EXHAUSTIVE" ]; then
    # Step 2a: expand the search hits.
    if notExists "${TMP_PATH}/result_expand_pref.dbtype"; then
        # shellcheck disable=SC2086
        "$MMSEQS" expandcomplex "${QUERYDB}" "${TARGETDB}" "${RESULT}" "${TMP_PATH}/result_expand_pref" ${THREADS_PAR} \
            || fail "Expandcomplex died"
    fi
    # Step 2b: align the expanded hits with the configured alignment module.
    if notExists "${TMP_PATH}/result_expand_aligned.dbtype"; then
        # shellcheck disable=SC2086
        "$MMSEQS" $COMPLEX_ALIGNMENT_ALGO "${QUERYDB}" "${TARGETDB}" "${TMP_PATH}/result_expand_pref" "${TMP_PATH}/result_expand_aligned" ${COMPLEX_ALIGN_PAR} \
            || fail "${COMPLEX_ALIGNMENT_ALGO} died"
    fi
    RESULT="${TMP_PATH}/result_expand_aligned"
fi

# Step 3: score the complexes.
# NOTE(review): the guard tests ${TMP_PATH}/complex_result.dbtype while the step
# writes to ${OUTPUT}; unless the caller points OUTPUT at that path the guard never
# skips this step. Left unchanged here - confirm the intended resume behaviour.
if notExists "${TMP_PATH}/complex_result.dbtype"; then
    # shellcheck disable=SC2086
    "$MMSEQS" scorecomplex "${QUERYDB}" "${TARGETDB}" "${RESULT}" "${OUTPUT}" ${SCORECOMPLEX_PAR} \
        || fail "ScoreComplex died"
fi

# Step 4: remove every intermediate this script created.
if [ -n "${REMOVE_TMP}" ]; then
    # shellcheck disable=SC2086
    "$MMSEQS" rmdb "${TMP_PATH}/result" ${VERBOSITY}
    if [ "$PREFMODE" != "EXHAUSTIVE" ]; then
        # shellcheck disable=SC2086
        "$MMSEQS" rmdb "${TMP_PATH}/result_expand_pref" ${VERBOSITY}
        # shellcheck disable=SC2086
        "$MMSEQS" rmdb "${TMP_PATH}/result_expand_aligned" ${VERBOSITY}
    fi
    rm -rf "${TMP_PATH}/search_tmp"
    rm -f "${TMP_PATH}/complexsearch.sh"
fi
44 changes: 18 additions & 26 deletions lib/foldseek/data/easycomplexsearch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,39 +26,31 @@ if notExists "${TARGET}.dbtype"; then
TARGET="${TMP_PATH}/target"
fi


SEARCH_RESULT="${TMP_PATH}/result"
if notExists "${SEARCH_RESULT}.dbtype"; then
if notExists "${TMP_PATH}/complex_result.dbtype"; then
# shellcheck disable=SC2086

"$MMSEQS" search "${QUERY}" "${TARGET}" "${SEARCH_RESULT}" "${TMP_PATH}/search_tmp" ${SEARCH_PAR} \
|| fail "Search died"
"$MMSEQS" complexsearch "${QUERY}" "${TARGET}" "${TMP_PATH}/complex_result" "${TMP_PATH}/complexsearch_tmp" ${COMPLEXSEARCH_PAR} \
|| fail "ComplexSearch died"
fi

SCORECOMPLEX_RESULT="${TMP_PATH}/result2"
if notExists "${SCORECOMPLEX_RESULT}/.dbtype"; then
# shellcheck disable=SC2086
$MMSEQS scorecomplex "${QUERY}" "${TARGET}" "${SEARCH_RESULT}" ${SCORECOMPLEX_RESULT} ${SCORECOMPLEX_PAR} \
|| fail "ScoreComplex died"
fi
# shellcheck disable=SC2086
"$MMSEQS" convertalis "${QUERY}" "${TARGET}" "${TMP_PATH}/complex_result" "${OUTPUT}" ${CONVERT_PAR} \
|| fail "Convert Alignments died"

if notExists "${TMP_PATH}/alis.dbtype"; then
if [ -z "${NO_REPORT}" ]; then
# shellcheck disable=SC2086
"$MMSEQS" convertalis "${QUERY}" "${TARGET}" "${SCORECOMPLEX_RESULT}" "${OUTPUT}" ${CONVERT_PAR} \
|| fail "Convert Alignments died"
"$MMSEQS" createcomplexreport "${QUERY}" "${TARGET}" "${TMP_PATH}/complex_result" "${OUTPUT}_report" ${REPORT_PAR} \
|| fail "createcomplexreport died"
fi
# shellcheck disable=SC2086
"$MMSEQS" createcomplexreport "${QUERY}" "${TARGET}" "${SCORECOMPLEX_RESULT}" "${REPORT}" ${REPORT_PAR}\
|| fail "Createcomplexreport dies"






if [ -n "${REMOVE_TMP}" ]; then
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/result" ${VERBOSITY}
if [ "$PREFMODE" != "EXHAUSTIVE" ]; then
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/result_expand_aligned" ${VERBOSITY}
fi
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/complex_result" ${VERBOSITY}
if [ -z "${LEAVE_INPUT}" ]; then
if [ -f "${TMP_PATH}/target" ]; then
# shellcheck disable=SC2086
Expand All @@ -79,6 +71,6 @@ if [ -n "${REMOVE_TMP}" ]; then
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/query_ss" ${VERBOSITY}
fi
rm -rf "${TMP_PATH}/search_tmp"
rm -f "${TMP_PATH}/easyscorecomplex.sh"
fi
rm -rf "${TMP_PATH}/complexsearch_tmp"
rm -f "${TMP_PATH}/easycomplexsearch.sh"
fi
6 changes: 3 additions & 3 deletions lib/foldseek/data/main.js

Large diffs are not rendered by default.

10 changes: 9 additions & 1 deletion lib/foldseek/data/structdatabases.sh
Original file line number Diff line number Diff line change
Expand Up @@ -150,9 +150,17 @@ case "${SELECTION}" in
push_back "${TMP_PATH}/pdb"
INPUT_TYPE="FOLDSEEK_DB"
;;
"CATH50")
if notExists "${TMP_PATH}/cath50.tar.gz"; then
downloadFile "https://foldseek.steineggerlab.workers.dev/cath50.tar.gz" "${TMP_PATH}/cath50.tar.gz"
downloadFile "https://foldseek.steineggerlab.workers.dev/cath50.version" "${TMP_PATH}/version"
fi
tar xvfz "${TMP_PATH}/cath50.tar.gz" -C "${TMP_PATH}"
push_back "${TMP_PATH}/cath50"
INPUT_TYPE="FOLDSEEK_DB"
;;
esac


if notExists "${OUTDB}.dbtype"; then
case "${INPUT_TYPE}" in
"FOLDSEEK_DB")
Expand Down
Binary file modified lib/foldseek/data/vendor.js.zst
Binary file not shown.
1 change: 1 addition & 0 deletions lib/foldseek/lib/mmseqs/src/CommandDeclarations.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ extern int convertkb(int argc, const char **argv, const Command& command);
extern int convertmsa(int argc, const char **argv, const Command& command);
extern int convertprofiledb(int argc, const char **argv, const Command& command);
extern int createdb(int argc, const char **argv, const Command& command);
extern int makepaddedseqdb(int argc, const char **argv, const Command& command);
extern int createindex(int argc, const char **argv, const Command& command);
extern int createlinindex(int argc, const char **argv, const Command& command);
extern int createseqfiledb(int argc, const char **argv, const Command& command);
Expand Down
21 changes: 14 additions & 7 deletions lib/foldseek/lib/mmseqs/src/MMseqsBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@ std::vector<Command> baseCommands = {
"Slower, sensitive clustering",
"mmseqs easy-cluster examples/DB.fasta result tmp\n"
"# Cluster output\n"
"# - result_rep_seq.fasta: Representatives\n"
"# - result_all_seq.fasta: FASTA-like per cluster\n"
"# - result_cluster.tsv: Adjacency list\n\n"
"# - result_rep_seq.fasta: Representatives\n"
"# - result_all_seqs.fasta: FASTA-like per cluster\n"
"# - result_cluster.tsv: Adjacency list\n\n"
"# Important parameter: --min-seq-id, --cov-mode and -c \n"
"# --cov-mode \n"
"# 0 1 2\n"
Expand All @@ -62,9 +62,9 @@ std::vector<Command> baseCommands = {
"Fast linear time cluster, less sensitive clustering",
"mmseqs easy-linclust examples/DB.fasta result tmp\n\n"
"# Linclust output\n"
"# - result_rep_seq.fasta: Representatives\n"
"# - result_all_seq.fasta: FASTA-like per cluster\n"
"# - result_cluster.tsv: Adjecency list\n\n"
"# - result_rep_seq.fasta: Representatives\n"
"# - result_all_seqs.fasta: FASTA-like per cluster\n"
"# - result_cluster.tsv: Adjecency list\n\n"
"# Important parameter: --min-seq-id, --cov-mode and -c \n"
"# --cov-mode \n"
"# 0 1 2\n"
Expand Down Expand Up @@ -130,14 +130,21 @@ std::vector<Command> baseCommands = {
"<i:fastaFile1[.gz|.bz2]> ... <i:fastaFileN[.gz|.bz2]>|<i:stdin> <o:sequenceDB>",
CITATION_MMSEQS2, {{"fast[a|q]File[.gz|bz2]|stdin", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfileStdinAndGeneric },
{"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }}},
{"makepaddedseqdb", makepaddedseqdb, &par.onlyverbosity, COMMAND_HIDDEN,
"Generate a padded sequence DB",
"Generate a padded sequence DB",
"Martin Steinegger <[email protected]>",
"<i:sequenceDB> <o:sequenceDB>",
CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb },
{"sequenceIndexDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
{"appenddbtoindex", appenddbtoindex, &par.appenddbtoindex, COMMAND_HIDDEN,
NULL,
NULL,
"Milot Mirdita <[email protected]>",
"<i:DB1> ... <i:DBN> <o:DB>",
CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::allDb },
{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
{"indexdb", indexdb, &par.indexdb, COMMAND_HIDDEN,
{"indexdb", indexdb, &par.indexdb, COMMAND_HIDDEN,
NULL,
NULL,
"Martin Steinegger <[email protected]>",
Expand Down
3 changes: 3 additions & 0 deletions lib/foldseek/lib/mmseqs/src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2573,6 +2573,9 @@ void Parameters::setDefaults() {
taxonomySearchMode = Parameters::TAXONOMY_APPROX_2BLCA;
taxonomyOutputMode = Parameters::TAXONOMY_OUTPUT_LCA;

// help
help = 0;

// substitution matrix
substitutionMatrices = {
{"nucleotide.out", nucleotide_out, nucleotide_out_len },
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ CacheFriendlyOperations<BINSIZE>::~CacheFriendlyOperations<BINSIZE>(){

template<unsigned int BINSIZE>
size_t CacheFriendlyOperations<BINSIZE>::findDuplicates(IndexEntryLocal **input, CounterResult *output,
size_t outputSize, unsigned short indexFrom, unsigned short indexTo, bool computeTotalScore) {
size_t outputSize, unsigned short indexFrom, unsigned short indexTo, bool computeTotalScore) {
do {
setupBinPointer();
CounterResult *lastPosition = (binDataFrame + BINCOUNT * binSize) - 1;
Expand All @@ -58,12 +58,16 @@ size_t CacheFriendlyOperations<BINSIZE>::mergeElementsByScore(CounterResult *inp
}

template<unsigned int BINSIZE>
size_t CacheFriendlyOperations<BINSIZE>::mergeElementsByDiagonal(CounterResult *inputOutputArray, const size_t N) {
size_t CacheFriendlyOperations<BINSIZE>::mergeElementsByDiagonal(CounterResult *inputOutputArray, const size_t N, const bool keepScoredHits) {
do {
setupBinPointer();
hashElements(inputOutputArray, N);
} while(checkForOverflowAndResizeArray(false) == true); // overflowed occurred
return mergeDiagonalDuplicates(inputOutputArray);
if(keepScoredHits){
return mergeDiagonalKeepScoredHitsDuplicates(inputOutputArray);
}else{
return mergeDiagonalDuplicates(inputOutputArray);
}
}

template<unsigned int BINSIZE>
Expand Down Expand Up @@ -93,6 +97,7 @@ size_t CacheFriendlyOperations<BINSIZE>::mergeDiagonalDuplicates(CounterResult *
--n;
}
// combine diagonals
// we keep only the last diagonal element
for (size_t n = 0; n < currBinSize; n++) {
const CounterResult &element = binStartPos[n];
const unsigned int hashBinElement = element.id >> (MASK_0_5_BIT);
Expand All @@ -109,6 +114,40 @@ size_t CacheFriendlyOperations<BINSIZE>::mergeDiagonalDuplicates(CounterResult *
return doubleElementCount;
}


/// Compacts the binned CounterResult entries into `output`, bin by bin.
///
/// Within each bin the entries are emitted from last to first. An entry is kept
/// when its `count` is non-zero, or when its diagonal differs from the byte
/// currently remembered in `duplicateBitArray` for its slot
/// (`id >> MASK_0_5_BIT`). Otherwise the output slot is simply overwritten by
/// the next entry.
///
/// @param output destination array; entries are written starting at index 0.
/// @return number of entries retained in `output`.
// NOTE(review): assumes `output` can hold every binned entry and that
// `duplicateBitArray` holds unsigned char values - confirm against the class declaration.
template<unsigned int BINSIZE>
size_t CacheFriendlyOperations<BINSIZE>::mergeDiagonalKeepScoredHitsDuplicates(CounterResult *output) {
    size_t keptCount = 0;

    for (size_t binIdx = 0; binIdx < BINCOUNT; ++binIdx) {
        const CounterResult *binBegin = binDataFrame + binIdx * binSize;
        const size_t entriesInBin = (bins[binIdx] - binBegin);

        // Forward pass: remember (diagonal + 1) per slot. After this pass each slot
        // holds the value written by its highest-index entry, which is the first
        // entry the reverse pass visits, so that entry always compares as "different".
        for (size_t i = 0; i < entriesInBin; ++i) {
            const unsigned int slot = binBegin[i].id >> (MASK_0_5_BIT);
            duplicateBitArray[slot] = static_cast<unsigned char>(binBegin[i].diagonal) + 1;
        }

        // Reverse pass: tentatively write every entry, and only advance the write
        // position when the entry must be retained.
        for (size_t i = entriesInBin; i-- > 0;) {
            const CounterResult &entry = binBegin[i];
            const unsigned int slot = entry.id >> (MASK_0_5_BIT);

            CounterResult &candidate = output[keptCount];
            candidate.id = entry.id;
            candidate.count = entry.count;
            candidate.diagonal = entry.diagonal;

            const unsigned char entryDiagonal = static_cast<unsigned char>(entry.diagonal);
            const bool hasScore = (candidate.count != 0);
            const bool diagonalChanged = (duplicateBitArray[slot] != entryDiagonal);
            if (hasScore || diagonalChanged) {
                ++keptCount;
            }
            // Remember this diagonal so that the next (earlier) entry of the same
            // slot with the same diagonal and no score is dropped.
            duplicateBitArray[slot] = entryDiagonal;
        }
    }
    return keptCount;
}

template<unsigned int BINSIZE>
size_t CacheFriendlyOperations<BINSIZE>::mergeScoreDuplicates(CounterResult *output) {
size_t doubleElementCount = 0;
Expand Down Expand Up @@ -211,12 +250,12 @@ size_t CacheFriendlyOperations<BINSIZE>::findDuplicates(CounterResult *output, s
output[doubleElementCount].id = element;
output[doubleElementCount].count = 0;
output[doubleElementCount].diagonal = tmpElementBuffer[n].diagonal;
// const unsigned char diagonal = static_cast<unsigned char>(tmpElementBuffer[n].diagonal);
// const unsigned char diagonal = static_cast<unsigned char>(tmpElementBuffer[n].diagonal);
// memory overflow can not happen since input array = output array
// if(duplicateBitArray[hashBinElement] != tmpElementBuffer[n].diagonal){
// std::cout << "seq="<< output[doubleElementCount].id << "\tDiag=" << (int) output[doubleElementCount].diagonal
// << " dup.Array=" << (int)duplicateBitArray[hashBinElement] << " tmp.Arr="<< (int)tmpElementBuffer[n].diagonal << std::endl;
// }
// if(duplicateBitArray[hashBinElement] != tmpElementBuffer[n].diagonal){
// std::cout << "seq="<< output[doubleElementCount].id << "\tDiag=" << (int) output[doubleElementCount].diagonal
// << " dup.Array=" << (int)duplicateBitArray[hashBinElement] << " tmp.Arr="<< (int)tmpElementBuffer[n].diagonal << std::endl;
// }
doubleElementCount += (duplicateBitArray[hashBinElement] != static_cast<unsigned char>(tmpElementBuffer[n].diagonal)) ? 1 : 0;
duplicateBitArray[hashBinElement] = static_cast<unsigned char>(tmpElementBuffer[n].diagonal);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ class CacheFriendlyOperations {
size_t mergeElementsByScore(CounterResult *inputOutputArray, const size_t N);

// merge elements in CounterResult by diagonal, combines elements with same ids that occur after each other
size_t mergeElementsByDiagonal(CounterResult *inputOutputArray, const size_t N);
size_t mergeElementsByDiagonal(CounterResult *inputOutputArray, const size_t N, const bool keepScoredHits = false);

size_t keepMaxScoreElementOnly(CounterResult *inputOutputArray, const size_t N);

Expand Down Expand Up @@ -124,6 +124,8 @@ class CacheFriendlyOperations {

size_t mergeDiagonalDuplicates(CounterResult *output);

size_t mergeDiagonalKeepScoredHitsDuplicates(CounterResult *output);

size_t keepMaxElement(CounterResult *output);
};

Expand Down
Loading

0 comments on commit 57a69be

Please sign in to comment.