Skip to content

Commit a804518

Browse files
committed
make full frame mode available through --translation-mode
1 parent 964bfaa commit a804518

File tree

7 files changed

+94
-137
lines changed

7 files changed

+94
-137
lines changed

data/workflow/createindex.sh

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,15 @@ INPUT="$1"
1818
if [ -n "$TRANSLATED" ]; then
1919
# 1. extract orf
2020
if notExists "$2/orfs_aa.dbtype"; then
21-
# shellcheck disable=SC2086
22-
"$MMSEQS" extractorfs "$INPUT" "$2/orfs_aa" ${ORF_PAR} \
23-
|| fail "extractorfs died"
21+
if [ -n "$ORF_SKIP" ]; then
22+
# shellcheck disable=SC2086
23+
"$MMSEQS" extractframes "$INPUT" "$2/orfs_aa" ${EXTRACT_FRAMES_PAR} \
24+
|| fail "extractframes died"
25+
else
26+
# shellcheck disable=SC2086
27+
"$MMSEQS" extractorfs "$INPUT" "$2/orfs_aa" ${ORF_PAR} \
28+
|| fail "extractorfs died"
29+
fi
2430
fi
2531

2632
# shellcheck disable=SC2086
@@ -33,7 +39,7 @@ if [ -n "$TRANSLATED" ]; then
3339
rm -f "$2/createindex.sh"
3440
fi
3541
elif [ -n "$LIN_NUCL" ] || [ -n "$NUCL" ]; then
36-
# 1. extract orf
42+
# 1. extract orf
3743
if notExists "$2/nucl_split_seq.dbtype"; then
3844
# shellcheck disable=SC2086
3945
"$MMSEQS" splitsequence "$INPUT" "$2/nucl_split_seq" ${SPLIT_SEQ_PAR} \

data/workflow/translated_search.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ TMP_PATH="$4"
2121
QUERY="$1"
2222
QUERY_ORF="$1"
2323
if [ -n "$QUERY_NUCL" ]; then
24-
if [ "$ORF_SKIP" = "TRUE" ]; then
24+
if [ -n "$ORF_SKIP" ]; then
2525
if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then
2626
# shellcheck disable=SC2086
2727
"$MMSEQS" extractframes "$1" "${TMP_PATH}/q_orfs_aa" ${EXTRACT_FRAMES_PAR} \
@@ -42,7 +42,7 @@ TARGET="$2"
4242
TARGET_ORF="$2"
4343
if [ -n "$TARGET_NUCL" ]; then
4444
if [ -n "$NO_TARGET_INDEX" ]; then
45-
if [ "$ORF_SKIP" = "TRUE" ]; then
45+
if [ -n "$ORF_SKIP" ]; then
4646
if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then
4747
# shellcheck disable=SC2086
4848
"$MMSEQS" extractframes "$2" "${TMP_PATH}/t_orfs_aa" ${EXTRACT_FRAMES_PAR} \

src/commons/Parameters.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ Parameters::Parameters():
172172
PARAM_ORF_FILTER_S(PARAM_ORF_FILTER_S_ID, "--orf-filter-s", "ORF filter sensitivity", "Sensitivity used for query ORF prefiltering", typeid(float), (void *) &orfFilterSens, "^[0-9]*(\\.[0-9]+)?$"),
173173
PARAM_ORF_FILTER_E(PARAM_ORF_FILTER_E_ID, "--orf-filter-e", "ORF filter e-value", "E-value threshold used for query ORF prefiltering", typeid(double), (void *) &orfFilterEval, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|[0-9]*(\\.[0-9]+)?$"),
174174
PARAM_LCA_SEARCH(PARAM_LCA_SEARCH_ID, "--lca-search", "LCA search mode", "Efficient search for LCA candidates", typeid(bool), (void *) &lcaSearch, "", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
175-
PARAM_ORF_SKIP(PARAM_ORF_SKIP_ID, "--orf-skip", "ORF skip mode", "???", typeid(bool), (void *) &orfSkip, ""),
175+
PARAM_TRANSLATION_MODE(PARAM_TRANSLATION_MODE_ID, "--translation-mode", "Translation mode", "Translation AA seq from nucleotide by 0: ORFs, 1: full reading frames", typeid(int), (void *) &translationMode, "^[0-1]{1}$"),
176176
// easysearch
177177
PARAM_GREEDY_BEST_HITS(PARAM_GREEDY_BEST_HITS_ID, "--greedy-best-hits", "Greedy best hits", "Choose the best hits greedily to cover the query", typeid(bool), (void *) &greedyBestHits, ""),
178178
// extractorfs
@@ -1271,7 +1271,7 @@ Parameters::Parameters():
12711271
searchworkflow.push_back(&PARAM_RUNNER);
12721272
searchworkflow.push_back(&PARAM_REUSELATEST);
12731273
searchworkflow.push_back(&PARAM_REMOVE_TMP_FILES);
1274-
searchworkflow.push_back(&PARAM_ORF_SKIP);
1274+
searchworkflow.push_back(&PARAM_TRANSLATION_MODE);
12751275
12761276
linsearchworkflow = combineList(align, kmersearch);
12771277
linsearchworkflow = combineList(linsearchworkflow, swapresult);
@@ -1294,8 +1294,9 @@ Parameters::Parameters():
12941294
12951295
// createindex workflow
12961296
createindex = combineList(indexdb, extractorfs);
1297-
createindex = combineList(createindex, translatenucs);
1297+
createindex = combineList(createindex, extractframes);
12981298
createindex = combineList(createindex, splitsequence);
1299+
createindex.push_back(&PARAM_TRANSLATION_MODE);
12991300
createindex.push_back(&PARAM_STRAND);
13001301
createindex.push_back(&PARAM_REMOVE_TMP_FILES);
13011302
@@ -2281,7 +2282,7 @@ void Parameters::setDefaults() {
22812282
orfFilterSens = 2.0;
22822283
orfFilterEval = 100;
22832284
lcaSearch = false;
2284-
orfSkip = false;
2285+
translationMode = PARAM_TRANSLATION_MODE_ORF;
22852286

22862287
greedyBestHits = false;
22872288

src/commons/Parameters.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,10 @@ class Parameters {
321321
static const int PARAM_RESULT_DIRECTION_QUERY = 0;
322322
static const int PARAM_RESULT_DIRECTION_TARGET = 1;
323323

324+
// translation mode
325+
static const int PARAM_TRANSLATION_MODE_ORF = 0;
326+
static const int PARAM_TRANSLATION_MODE_FRAME = 1;
327+
324328
// path to databases
325329
std::string db1;
326330
std::string db1Index;
@@ -467,7 +471,7 @@ class Parameters {
467471
float orfFilterSens;
468472
double orfFilterEval;
469473
bool lcaSearch;
470-
bool orfSkip;
474+
int translationMode;
471475

472476
// easysearch
473477
bool greedyBestHits;
@@ -887,7 +891,7 @@ class Parameters {
887891
PARAMETER(PARAM_ORF_FILTER_S)
888892
PARAMETER(PARAM_ORF_FILTER_E)
889893
PARAMETER(PARAM_LCA_SEARCH)
890-
PARAMETER(PARAM_ORF_SKIP)
894+
PARAMETER(PARAM_TRANSLATION_MODE)
891895

892896
// easysearch
893897
PARAMETER(PARAM_GREEDY_BEST_HITS)

src/util/extractframes.cpp

Lines changed: 60 additions & 118 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,6 @@
55
#include "Matcher.h"
66
#include "Util.h"
77
#include "TranslateNucl.h"
8-
#include "itoa.h"
9-
108
#include "Orf.h"
119

1210
#include <unistd.h>
@@ -17,6 +15,41 @@
1715
#include <omp.h>
1816
#endif
1917

18+
void handleSingleFrame(TranslateNucl& translateNucl, DBWriter& sequenceWriter, DBWriter& headerWriter, unsigned int key, char* headerBuffer, const char* data, size_t seqLen, int frame, bool reverse, bool translate, char*& aaBuffer, size_t& aaBufferSize, int thread_idx) {
19+
data = data + frame;
20+
seqLen = seqLen - frame;
21+
if (translate == true) {
22+
if (seqLen < 3) {
23+
return;
24+
}
25+
size_t codonLength = (seqLen / 3) * 3;
26+
if ((codonLength + 1) > aaBufferSize) {
27+
aaBufferSize = codonLength * 1.5 + 1;
28+
aaBuffer = (char*)realloc(aaBuffer, aaBufferSize * sizeof(char));
29+
}
30+
translateNucl.translate(aaBuffer, data, codonLength);
31+
aaBuffer[codonLength / 3] = '\n';
32+
sequenceWriter.writeData(aaBuffer, (codonLength / 3) + 1, key, thread_idx);
33+
size_t bufferLen;
34+
if (reverse) {
35+
bufferLen = Orf::writeOrfHeader(headerBuffer, key, frame + codonLength, static_cast<size_t>(frame), 0, 0);
36+
} else {
37+
bufferLen = Orf::writeOrfHeader(headerBuffer, key, static_cast<size_t>(frame), frame + codonLength, 0, 0);
38+
}
39+
headerWriter.writeData(headerBuffer, bufferLen, key, thread_idx);
40+
} else {
41+
// +1: add newline, but remove it from the end pos
42+
sequenceWriter.writeData(data, seqLen + 1, key, thread_idx);
43+
size_t bufferLen;
44+
if (reverse) {
45+
bufferLen = Orf::writeOrfHeader(headerBuffer, key, seqLen - 1, static_cast<size_t>(frame), 0, 0);
46+
} else {
47+
bufferLen = Orf::writeOrfHeader(headerBuffer, key, static_cast<size_t>(frame), seqLen - 1, 0, 0);
48+
}
49+
headerWriter.writeData(headerBuffer, bufferLen, key, thread_idx);
50+
}
51+
}
52+
2053
int extractframes(int argc, const char **argv, const Command& command) {
2154
Parameters& par = Parameters::getInstance();
2255
par.parseParameters(argc, argv, command, true, 0, 0);
@@ -25,7 +58,7 @@ int extractframes(int argc, const char **argv, const Command& command) {
2558
reader.open(DBReader<unsigned int>::NOSORT);
2659

2760
int outputDbtype = reader.getDbtype();
28-
if(par.translate) {
61+
if (par.translate) {
2962
outputDbtype = Parameters::DBTYPE_AMINO_ACIDS;
3063
}
3164
DBWriter sequenceWriter(par.db2.c_str(), par.db2Index.c_str(), par.threads, par.compressed, outputDbtype);
@@ -52,157 +85,66 @@ int extractframes(int argc, const char **argv, const Command& command) {
5285
queryFrom = 0;
5386
}
5487

55-
char* aa = new char[par.maxSeqLen + 3 + 1];
88+
size_t aaBufferSize = par.maxSeqLen + 3 + 1;
89+
char* aa = NULL;
90+
if (par.translate == true) {
91+
aa = (char*)malloc(aaBufferSize * sizeof(char));
92+
}
93+
5694
char buffer[1024];
95+
5796
std::string reverseComplementStr;
5897
reverseComplementStr.reserve(32000);
98+
5999
for (unsigned int i = queryFrom; i < (queryFrom + querySize); ++i){
60100
progress.updateProgress();
61101

62102
unsigned int key = reader.getDbKey(i);
63103
const char* data = reader.getData(i, thread_idx);
64104
size_t seqLen = reader.getSeqLen(i);
65105

66-
size_t bufferLen;
67106
if (forwardFrames & Orf::FRAME_1) {
68-
if (par.translate) {
69-
size_t currSeqLen = seqLen + 1;
70-
if (currSeqLen >= 3) {
71-
if (currSeqLen > (3 * par.maxSeqLen)) {
72-
currSeqLen = (3 * par.maxSeqLen);
73-
}
74-
size_t condonLength = currSeqLen / 3 * 3;
75-
translateNucl.translate(aa, data, condonLength);
76-
sequenceWriter.writeData(aa, (condonLength / 3), key, thread_idx);
77-
bufferLen = Orf::writeOrfHeader(buffer, key, static_cast<size_t >(0), seqLen - 1, 0, 0);
78-
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
79-
}
80-
} else {
81-
sequenceWriter.writeData(data, seqLen + 1, key, thread_idx);
82-
bufferLen = Orf::writeOrfHeader(buffer, key, static_cast<size_t >(0), seqLen - 1, 0, 0);
83-
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
84-
}
107+
handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 0, false, par.translate, aa, aaBufferSize, thread_idx);
85108
}
86109
if (forwardFrames & Orf::FRAME_2) {
87-
if (par.translate) {
88-
size_t currSeqLen = seqLen;
89-
if (currSeqLen >= 3) {
90-
if (currSeqLen > (3 * par.maxSeqLen)) {
91-
currSeqLen = (3 * par.maxSeqLen);
92-
}
93-
size_t condonLength = currSeqLen / 3 * 3;
94-
translateNucl.translate(aa, data + 1, condonLength);
95-
sequenceWriter.writeData(aa, (condonLength / 3), key, thread_idx);
96-
bufferLen = Orf::writeOrfHeader(buffer, key, static_cast<size_t >(1), seqLen - 2, 0, 0);
97-
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
98-
}
99-
} else {
100-
sequenceWriter.writeData(data + 1, seqLen, key, thread_idx);
101-
bufferLen = Orf::writeOrfHeader(buffer, key, static_cast<size_t >(1), seqLen - 2, 0, 0);
102-
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
103-
}
110+
handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 1, false, par.translate, aa, aaBufferSize, thread_idx);
104111
}
105112
if (forwardFrames & Orf::FRAME_3) {
106-
if (par.translate) {
107-
size_t currSeqLen = seqLen - 1;
108-
if (currSeqLen >= 3) {
109-
if (currSeqLen > (3 * par.maxSeqLen)) {
110-
currSeqLen = (3 * par.maxSeqLen);
111-
}
112-
size_t condonLength = currSeqLen / 3 * 3;
113-
translateNucl.translate(aa, data + 2, condonLength);
114-
sequenceWriter.writeData(aa, (condonLength / 3), key, thread_idx);
115-
bufferLen = Orf::writeOrfHeader(buffer, key, static_cast<size_t >(2), seqLen - 3, 0, 0);
116-
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
117-
}
118-
} else {
119-
sequenceWriter.writeData(data + 2, seqLen - 1, key, thread_idx);
120-
bufferLen = Orf::writeOrfHeader(buffer, key, static_cast<size_t >(2), seqLen - 3, 0, 0);
121-
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
122-
}
113+
handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 2, false, par.translate, aa, aaBufferSize, thread_idx);
123114
}
124115

125-
126-
if(reverseFrames != 0){
127-
size_t sequenceLength = seqLen;
116+
if (reverseFrames != 0) {
128117
// bool hasWrongChar = false;
129-
for(size_t pos = 0; pos < sequenceLength; ++pos) {
130-
char reverseComplement = Orf::complement(data[sequenceLength - pos - 1]);
118+
for (size_t pos = 0; pos < seqLen; ++pos) {
119+
char reverseComplement = Orf::complement(data[seqLen - pos - 1]);
131120
reverseComplement = (reverseComplement == '.') ? 'N' : reverseComplement;
132121
reverseComplementStr.push_back(reverseComplement);
133122
// hasWrongChar |= (reverseComplement == '.');
134123
}
135-
// if(hasWrongChar == true){
136-
// continue;
137-
// }
124+
// if (hasWrongChar == true) {
125+
// continue;
126+
// }
138127
reverseComplementStr.push_back('\n');
139-
140-
seqLen = reverseComplementStr.size();
128+
seqLen = reverseComplementStr.size() - 1;
141129
data = reverseComplementStr.c_str();
142130
}
143131

144132
if (reverseFrames & Orf::FRAME_1) {
145-
if (par.translate) {
146-
size_t currSeqLen = seqLen;
147-
if (currSeqLen >= 3) {
148-
if (currSeqLen > (3 * par.maxSeqLen)) {
149-
currSeqLen = (3 * par.maxSeqLen);
150-
}
151-
size_t condonLength = currSeqLen / 3 * 3;
152-
translateNucl.translate(aa, data, condonLength);
153-
sequenceWriter.writeData(aa, (condonLength / 3), key, thread_idx);
154-
bufferLen = Orf::writeOrfHeader(buffer, key, seqLen - 2, static_cast<size_t >(0), 0, 0);
155-
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
156-
}
157-
} else {
158-
sequenceWriter.writeData(data, seqLen, key, thread_idx);
159-
bufferLen = Orf::writeOrfHeader(buffer, key, seqLen - 2, static_cast<size_t >(0), 0, 0);
160-
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
161-
}
133+
handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 0, true, par.translate, aa, aaBufferSize, thread_idx);
162134
}
163135

164136
if (reverseFrames & Orf::FRAME_2) {
165-
if (par.translate) {
166-
size_t currSeqLen = seqLen - 1;
167-
if (currSeqLen >= 3) {
168-
if (currSeqLen > (3 * par.maxSeqLen)) {
169-
currSeqLen = (3 * par.maxSeqLen);
170-
}
171-
size_t condonLength = currSeqLen / 3 * 3;
172-
translateNucl.translate(aa, data + 1, condonLength);
173-
sequenceWriter.writeData(aa, (condonLength / 3), key, thread_idx);
174-
bufferLen = Orf::writeOrfHeader(buffer, key, seqLen - 3, static_cast<size_t >(1), 0, 0);
175-
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
176-
}
177-
} else {
178-
sequenceWriter.writeData(data + 1, seqLen - 1, key, thread_idx);
179-
bufferLen = Orf::writeOrfHeader(buffer, key, seqLen - 3, static_cast<size_t >(1), 0, 0);
180-
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
181-
}
137+
handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 1, true, par.translate, aa, aaBufferSize, thread_idx);
182138
}
183139

184140
if (reverseFrames & Orf::FRAME_3) {
185-
if (par.translate) {
186-
size_t currSeqLen = seqLen - 2;
187-
if (currSeqLen >= 3) {
188-
if (currSeqLen > (3 * par.maxSeqLen)) {
189-
currSeqLen = (3 * par.maxSeqLen);
190-
}
191-
size_t condonLength = currSeqLen / 3 * 3;
192-
translateNucl.translate(aa, data + 2, condonLength);
193-
sequenceWriter.writeData(aa, (condonLength / 3), key, thread_idx);
194-
bufferLen = Orf::writeOrfHeader(buffer, key, seqLen - 4, static_cast<size_t >(2), 0, 0);
195-
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
196-
}
197-
} else {
198-
sequenceWriter.writeData(data + 2, seqLen - 2, key, thread_idx);
199-
bufferLen = Orf::writeOrfHeader(buffer, key, seqLen - 4, static_cast<size_t >(2), 0, 0);
200-
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
201-
}
141+
handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 2, true, par.translate, aa, aaBufferSize, thread_idx);
202142
}
203143
reverseComplementStr.clear();
204144
}
205-
delete[] aa;
145+
if (aa != NULL) {
146+
free(aa);
147+
}
206148
}
207149
headerWriter.close(true);
208150
sequenceWriter.close(true);

src/workflow/CreateIndex.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,12 @@ int createindex(Parameters &par, const Command &command, const std::string &inde
4242
cmd.addVariable("INDEXER", indexerModule.c_str());
4343
cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL);
4444
par.translate = 1;
45-
cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str());
46-
cmd.addVariable("EXTRACT_FRAMES_PAR", par.createParameterString(par.extractframes).c_str());
45+
cmd.addVariable("ORF_SKIP", par.translationMode == Parameters::PARAM_TRANSLATION_MODE_FRAME ? "TRUE" : NULL);
46+
if (par.translationMode == Parameters::PARAM_TRANSLATION_MODE_FRAME) {
47+
cmd.addVariable("EXTRACT_FRAMES_PAR", par.createParameterString(par.extractframes).c_str());
48+
} else {
49+
cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str());
50+
}
4751
cmd.addVariable("SPLIT_SEQ_PAR", par.createParameterString(par.splitsequence).c_str());
4852
if(indexerModule == "kmerindexdb"){
4953
cmd.addVariable("INDEX_PAR", par.createParameterString(par.kmerindexdb).c_str());

0 commit comments

Comments
 (0)