Skip to content

Commit

Permalink
Merge commit '15a2e1e842d6af8b4d2c6f12c8cd2cb2653bb7d4'
Browse files Browse the repository at this point in the history
  • Loading branch information
gamcil committed Jul 3, 2024
2 parents bd24ba5 + 15a2e1e commit 92b1690
Show file tree
Hide file tree
Showing 1,538 changed files with 512,282 additions and 441 deletions.
28 changes: 26 additions & 2 deletions lib/foldseek/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ project(foldseek C CXX)
#set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/lib/mmseqs/cmake")

set(ENABLE_PROSTT5 1 CACHE BOOL "Enable ProstT5")
set(ENABLE_CUDA 0 CACHE BOOL "Enable CUDA")
set(IGNORE_RUST_VERSION 0 CACHE BOOL "Ignore Rust version check")

if (NOT CMAKE_BUILD_TYPE)
Expand All @@ -31,8 +33,8 @@ include_directories(lib)
include_directories(lib/3di)
include_directories(lib/pulchra)
include_directories(lib/kerasify)
include_directories(lib/gemmi)

set(ENV{CARGO_NET_OFFLINE} true)
add_subdirectory(lib/corrosion)
# don't try to link to gcc_s, it's not needed anyway
list(REMOVE_ITEM Rust_CARGO_TARGET_LINK_NATIVE_LIBS "gcc_s")
Expand Down Expand Up @@ -75,8 +77,30 @@ corrosion_import_crate(
)
include_directories(lib/block-aligner/c)
if(EMSCRIPTEN)
corrosion_add_target_local_rustflags(block-aligner-c "-Clink-args=--no-entry -sRELOCATABLE=1")
corrosion_add_target_local_rustflags(block_aligner_c "-Clink-args=--no-entry -sRELOCATABLE=1")
endif()

# Cargo feature forwarded to the ProstT5 crate; stays empty unless CUDA is requested.
set(CANDLE_FEATURE "")
if(ENABLE_CUDA)
    # A CUDA build requires CUDAToolkit_ROOT to be defined; abort configuration otherwise.
    if(DEFINED CUDAToolkit_ROOT)
        set(CANDLE_FEATURE cuda)
    else()
        message(FATAL_ERROR "please set -DCUDAToolkit_ROOT=path-to-cuda")
    endif()
endif()

if(ENABLE_PROSTT5)
    # Import the crate under lib/prostt5/c as a static library with the selected feature.
    corrosion_import_crate(
        MANIFEST_PATH lib/prostt5/c/Cargo.toml
        CRATE_TYPES staticlib
        FEATURES "${CANDLE_FEATURE}"
    )
    if(ENABLE_CUDA)
        # Hand the toolkit location to the cprostt5 build as the CUDA_ROOT variable.
        corrosion_set_env_vars(cprostt5 CUDA_ROOT=${CUDAToolkit_ROOT})
    endif()
    include_directories(lib/prostt5/c)
    if(EMSCRIPTEN)
        # Same extra link arguments as applied to the block-aligner crate for Emscripten.
        corrosion_add_target_local_rustflags(cprostt5 "-Clink-args=--no-entry -sRELOCATABLE=1")
    endif()
endif()

add_subdirectory(src)
50 changes: 36 additions & 14 deletions lib/foldseek/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,12 @@ Foldseek enables fast and sensitive comparisons of large protein structure sets.
<p align="center"><img src="https://github.com/steineggerlab/foldseek/blob/master/.github/foldseek.png" height="250"/></p>

## Publications
[van Kempen M, Kim S, Tumescheit C, Mirdita M, Lee J, Gilchrist C, Söding J, and Steinegger M. Fast and accurate protein structure search with Foldseek. Nature Biotechnology, doi:10.1038/s41587-023-01773-0 (2023)](https://www.nature.com/articles/s41587-023-01773-0)
[van Kempen M, Kim S, Tumescheit C, Mirdita M, Lee J, Gilchrist CLM, Söding J, and Steinegger M. Fast and accurate protein structure search with Foldseek. Nature Biotechnology, doi:10.1038/s41587-023-01773-0 (2023)](https://www.nature.com/articles/s41587-023-01773-0)

[Barrio-Hernandez I, Yeo J, Jänes J, Mirdita M, Gilchrist CLM, Wein T, Varadi M, Velankar S, Beltrao P and Steinegger M. Clustering predicted structures at the scale of the known protein universe. Nature, doi:10.1038/s41586-023-06510-w (2023)](https://www.nature.com/articles/s41586-023-06510-w)

[Kim W, Mirdita M, Levy Karin E, Gilchrist CLM, Schweke H, Söding J, Levy E, and Steinegger M. Rapid and Sensitive Protein Complex Alignment with Foldseek-Multimer. bioRxiv, doi:10.1101/2024.04.14.589414 (2024)](https://www.biorxiv.org/content/10.1101/2024.04.14.589414v1)

[Barrio-Hernandez I, Yeo J, Jänes J, Mirdita M, Gilchrist LMC, Wein T, Varadi M, Velankar S, Beltrao P and Steinegger M. Clustering predicted structures at the scale of the known protein universe. Nature, doi:10.1038/s41586-023-06510-w (2023)](https://www.nature.com/articles/s41586-023-06510-w)
# Table of Contents

- [Foldseek](#foldseek)
Expand All @@ -21,13 +24,14 @@ Foldseek enables fast and sensitive comparisons of large protein structure sets.
- [Output](#output-search)
- [Important Parameters](#important-search-parameters)
- [Alignment Mode](#alignment-mode)
- [Structure search from FASTA input](#structure-search-from-fasta-input)
- [Databases](#databases)
- [Create Custom Databases and Indexes](#create-custom-databases-and-indexes)
- [Cluster](#cluster)
- [Output](#output-cluster)
- [Important Parameters](#important-cluster-parameters)
- [Complexsearch](#complexsearch)
- [Output](#complex-search-output)
- [Multimer](#multimersearch)
- [Output](#multimer-search-output)
- [Main Modules](#main-modules)
- [Examples](#examples)

Expand Down Expand Up @@ -135,6 +139,24 @@ By default, Foldseek uses its local 3Di+AA structural alignment but it also supp

If alignment type is set to tmalign (`--alignment-type 1`), the results will be sorted by the TMscore normalized by query length. The TMscore is used for reporting two fields: the e-value=(qTMscore+tTMscore)/2 and the score=(qTMscore*100). All output fields (e.g., pident, fident, and alnlen) are calculated based on the TMalign alignment.

#### Structure search from FASTA input
Search by predicting 3Di directly from amino acid sequences without the need for existing protein structures.
This feature uses the [ProstT5](https://www.biorxiv.org/content/10.1101/2023.07.23.550085v2) protein language model, runs on CPU by default, and is about 400-4000x faster than predicting structures with [ColabFold](https://github.com/sokrypton/ColabFold).

```
foldseek databases ProstT5 weights tmp
foldseek databases PDB pdb tmp
foldseek easy-search QUERY.fasta pdb result.m8 tmp --prostt5-model weights
```

Or create your own structural database from a FASTA file.

```
foldseek createdb db.fasta db --prostt5-model weights
```

Faster inference using GPU/CUDA is also supported. Compile from source with `cmake -DCMAKE_BUILD_TYPE=Release -DENABLE_CUDA=1 -DCUDAToolkit_ROOT=Path-To-Cuda-Toolkit` and call with `createdb/easy-search --prostt5-model weights --gpu 1`.

### Databases
The `databases` command downloads pre-generated databases like PDB or AlphaFoldDB.

Expand Down Expand Up @@ -219,26 +241,26 @@ MCAR...Q
| --lddt-threshold | Alignment | accept alignments with an alignment LDDT score > thr |


### Complexsearch
The `easy-complexsearch` module is designed for querying one or more protein complex (multi-chain) structures (supported input formats: PDB/mmCIF, flat or gzipped) against a target database of protein complex structures. It reports the similarity metrices between the complexes (e.g., the TMscore).
### Multimersearch
The `easy-multimersearch` module is designed for querying one or more protein complex (multi-chain) structures (supported input formats: PDB/mmCIF, flat or gzipped) against a target database of protein complex structures. It reports the similarity metrics between the complexes (e.g., the TMscore).

#### Using Complexsearch
#### Using Multimersearch
The examples below use files that can be found in the `example` directory, which is part of the Foldseek repo, if you clone it.
If you use the precompiled version of the software, you can download the files directly: [1tim.pdb.gz](https://github.com/steineggerlab/foldseek/raw/master/example/1tim.pdb.gz) and [8tim.pdb.gz](https://github.com/steineggerlab/foldseek/raw/master/example/8tim.pdb.gz).

For a pairwise alignment of complexes using `easy-complexsearch`, run the following command:
For a pairwise alignment of complexes using `easy-multimersearch`, run the following command:
```
foldseek easy-complexsearch example/1tim.pdb.gz example/8tim.pdb.gz result tmpFolder
foldseek easy-multimersearch example/1tim.pdb.gz example/8tim.pdb.gz result tmpFolder
```
Foldseek `easy-complexsearch` can also be used for searching one or more query complexes against a target database:
Foldseek `easy-multimersearch` can also be used for searching one or more query complexes against a target database:
```
foldseek databases PDB pdb tmp
foldseek easy-complexsearch example/1tim.pdb.gz pdb result tmpFolder
foldseek easy-multimersearch example/1tim.pdb.gz pdb result tmpFolder
```

#### Complex Search Output
#### Multimer Search Output
##### Tab-separated-complex
By default, `easy-complexsearch` reports the output alignment in a tab-separated file.
By default, `easy-multimersearch` reports the output alignment in a tab-separated file.
The default output fields are: `query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,complexassignid` but they can be customized with the `--format-output` option e.g., `--format-output "query,target,complexqtmscore,complexttmscore,complexassignid"` alters the output to show specific scores and identifiers.

| Code | Description |
Expand All @@ -260,7 +282,7 @@ The default output fields are: `query,target,fident,alnlen,mismatch,gapopen,qsta
```

##### Complex Report
`easy-complexsearch` also generates a report (prefixed `_report`), which provides a summary of the inter-complex chain matching, including identifiers, chains, TMscores, rotation matrices, translation vectors, and assignment IDs. The report includes the following fields:
`easy-multimersearch` also generates a report (prefixed `_report`), which provides a summary of the inter-complex chain matching, including identifiers, chains, TMscores, rotation matrices, translation vectors, and assignment IDs. The report includes the following fields:
| Column | Description |
| --- | --- |
| 1 | Identifier of the query complex |
Expand Down
4 changes: 2 additions & 2 deletions lib/foldseek/data/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ set(COMPILED_RESOURCES
evalue_nn.kerasify
main.js
vendor.js.zst
complexsearch.sh
easycomplexsearch.sh
multimersearch.sh
easymultimersearch.sh
)

set(GENERATED_OUTPUT_HEADERS "")
Expand Down
46 changes: 0 additions & 46 deletions lib/foldseek/data/complexsearch.sh

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -26,31 +26,25 @@ if notExists "${TARGET}.dbtype"; then
TARGET="${TMP_PATH}/target"
fi

if notExists "${TMP_PATH}/complex_result.dbtype"; then
if notExists "${TMP_PATH}/multimer_result.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" complexsearch "${QUERY}" "${TARGET}" "${TMP_PATH}/complex_result" "${TMP_PATH}/complexsearch_tmp" ${COMPLEXSEARCH_PAR} \
|| fail "ComplexSearch died"
"$MMSEQS" multimersearch "${QUERY}" "${TARGET}" "${TMP_PATH}/multimer_result" "${TMP_PATH}/multimersearch_tmp" ${MULTIMERSEARCH_PAR} \
|| fail "multimersearch died"
fi

# shellcheck disable=SC2086
"$MMSEQS" convertalis "${QUERY}" "${TARGET}" "${TMP_PATH}/complex_result" "${OUTPUT}" ${CONVERT_PAR} \
"$MMSEQS" convertalis "${QUERY}" "${TARGET}" "${TMP_PATH}/multimer_result" "${OUTPUT}" ${CONVERT_PAR} \
|| fail "Convert Alignments died"

if [ -z "${NO_REPORT}" ]; then
# shellcheck disable=SC2086
"$MMSEQS" createcomplexreport "${QUERY}" "${TARGET}" "${TMP_PATH}/complex_result" "${OUTPUT}_report" ${REPORT_PAR} \
|| fail "createcomplexreport died"
"$MMSEQS" createmultimerreport "${QUERY}" "${TARGET}" "${TMP_PATH}/multimer_result" "${OUTPUT}_report" ${REPORT_PAR} \
|| fail "createmultimerreport died"
fi

if [ -n "${REMOVE_TMP}" ]; then
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/result" ${VERBOSITY}
if [ "$PREFMODE" != "EXHAUSTIVE" ]; then
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/result_expand_aligned" ${VERBOSITY}
fi
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/complex_result" ${VERBOSITY}
"$MMSEQS" rmdb "${TMP_PATH}/multimer_result" ${VERBOSITY}
if [ -z "${LEAVE_INPUT}" ]; then
if [ -f "${TMP_PATH}/target" ]; then
# shellcheck disable=SC2086
Expand All @@ -71,6 +65,6 @@ if [ -n "${REMOVE_TMP}" ]; then
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/query_ss" ${VERBOSITY}
fi
rm -rf "${TMP_PATH}/complexsearch_tmp"
rm -f "${TMP_PATH}/easycomplexsearch.sh"
rm -rf "${TMP_PATH}/multimersearch_tmp"
rm -f "${TMP_PATH}/easymultimersearch.sh"
fi
2 changes: 1 addition & 1 deletion lib/foldseek/data/easystructuresearch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ notExists() {
if notExists "${1}.dbtype"; then
if notExists "${TMP_PATH}/query.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" createdb "$@" "${TMP_PATH}/query" ${CREATEDB_PAR} \
"$MMSEQS" createdb "$@" "${TMP_PATH}/query" ${CREATEDB_QUERY_PAR} \
|| fail "query createdb died"
fi
QUERY="${TMP_PATH}/query"
Expand Down
58 changes: 58 additions & 0 deletions lib/foldseek/data/multimersearch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/bin/sh -e
# Multimer search workflow:
#   1. search          QUERYDB vs TARGETDB            -> ${TMP_PATH}/result
#   2. expandmultimer  (skipped when PREFMODE is EXHAUSTIVE)
#   3. alignment       with the module named by MULTIMER_ALIGNMENT_ALGO
#   4. scoremultimer   -> ${OUTPUT}
#
# Every step is skipped when its output ".dbtype" file already exists, so an
# interrupted run can be resumed with the same TMP_PATH.
#
# The script does not set any of the following variables itself; they must be
# provided by the caller's environment:
#   MMSEQS, QUERYDB, TARGETDB, OUTPUT, TMP_PATH, PREFMODE,
#   MULTIMER_ALIGNMENT_ALGO, SEARCH_PAR, THREADS_PAR, MULTIMER_ALIGN_PAR,
#   SCOREMULTIMER_PAR, REMOVE_TMP, VERBOSITY

# Print "Error: <message>" and abort. Only the first argument is printed, so
# callers must pass the whole message as a single word.
fail() {
    echo "Error: $1"
    exit 1
}

# Succeeds when the given path is not a regular file.
notExists() {
    [ ! -f "$1" ]
}

if notExists "${TMP_PATH}/result.dbtype"; then
    # shellcheck disable=SC2086
    "$MMSEQS" search "${QUERYDB}" "${TARGETDB}" "${TMP_PATH}/result" "${TMP_PATH}/search_tmp" ${SEARCH_PAR} \
        || fail "Search died"
fi

RESULT="${TMP_PATH}/result"
if [ "$PREFMODE" != "EXHAUSTIVE" ]; then
    if notExists "${TMP_PATH}/result_expand_pref.dbtype"; then
        # shellcheck disable=SC2086
        "$MMSEQS" expandmultimer "${QUERYDB}" "${TARGETDB}" "${RESULT}" "${RESULT}_expand_pref" ${THREADS_PAR} \
            || fail "expandmultimer died"
    fi
    if notExists "${TMP_PATH}/result_expand_aligned.dbtype"; then
        if [ "$MULTIMER_ALIGNMENT_ALGO" = "tmalign" ]; then
            # tmalign runs in two stages: structurealign (with -e 100) writes an
            # intermediate result that tmalign then takes as its input.
            # shellcheck disable=SC2086
            "$MMSEQS" structurealign "${QUERYDB}" "${TARGETDB}" "${RESULT}_expand_pref" "${RESULT}_expand_aligned_tmp" -e 100 ${THREADS_PAR} \
                || fail "${MULTIMER_ALIGNMENT_ALGO} died"
            # shellcheck disable=SC2086
            "$MMSEQS" tmalign "${QUERYDB}" "${TARGETDB}" "${RESULT}_expand_aligned_tmp" "${RESULT}_expand_aligned" ${MULTIMER_ALIGN_PAR} \
                || fail "${MULTIMER_ALIGNMENT_ALGO} died"
        else
            # shellcheck disable=SC2086
            "$MMSEQS" "${MULTIMER_ALIGNMENT_ALGO}" "${QUERYDB}" "${TARGETDB}" "${RESULT}_expand_pref" "${RESULT}_expand_aligned" ${MULTIMER_ALIGN_PAR} \
                || fail "${MULTIMER_ALIGNMENT_ALGO} died"
        fi
    fi
    # Later steps consume the expanded and aligned result instead of the raw search hits.
    RESULT="${TMP_PATH}/result_expand_aligned"
fi

if notExists "${OUTPUT}.dbtype"; then
    # shellcheck disable=SC2086
    "$MMSEQS" scoremultimer "${QUERYDB}" "${TARGETDB}" "${RESULT}" "${OUTPUT}" ${SCOREMULTIMER_PAR} \
        || fail "scoremultimer died"
fi

if [ -n "${REMOVE_TMP}" ]; then
    # shellcheck disable=SC2086
    "$MMSEQS" rmdb "${TMP_PATH}/result" ${VERBOSITY}
    if [ "$PREFMODE" != "EXHAUSTIVE" ]; then
        # shellcheck disable=SC2086
        "$MMSEQS" rmdb "${TMP_PATH}/result_expand_aligned" ${VERBOSITY}
        # Also drop the intermediate databases created above. The "_tmp" one
        # only exists on the tmalign path, hence the existence check.
        for INTERMEDIATE in "${TMP_PATH}/result_expand_pref" "${TMP_PATH}/result_expand_aligned_tmp"; do
            if [ -f "${INTERMEDIATE}.dbtype" ]; then
                # shellcheck disable=SC2086
                "$MMSEQS" rmdb "${INTERMEDIATE}" ${VERBOSITY}
            fi
        done
    fi
    rm -rf "${TMP_PATH}/search_tmp"
    rm -f "${TMP_PATH}/multimersearch.sh"
fi
Loading

0 comments on commit 92b1690

Please sign in to comment.