Skip to content

Commit e095774

Browse files
committed
gpu client waits for gpuserver to be ready
1 parent 492297b commit e095774

File tree

7 files changed

+68
-19
lines changed

7 files changed

+68
-19
lines changed

src/commons/GpuUtil.h

+8
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
#define GPUUTIL_H
33

44
#include "Debug.h"
5+
#include "FileUtil.h"
6+
#include "PrefilteringIndexReader.h"
57
#include "marv.h"
68
#include <atomic>
79
#include <cstring>
@@ -37,6 +39,12 @@ struct GPUSharedMemory {
3739
sizeof(int8_t) * 20 * maxSeqLen; // Size for profile data
3840
}
3941

42+
static std::string getShmHash(const std::string& db) {
43+
std::string dbpath = FileUtil::getRealPathFromSymLink(PrefilteringIndexReader::dbPathWithoutIndex(db));
44+
size_t hash = Util::hash(dbpath.c_str(), dbpath.length());
45+
return SSTR(hash);
46+
}
47+
4048
// Allocate and initialize shared memory
4149
static GPUSharedMemory* alloc(const std::string& name, unsigned int maxSeqLen, unsigned int maxResListLen) {
4250
size_t shm_size = calculateSize(maxSeqLen, maxResListLen);

src/commons/Parameters.cpp

+7
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ Parameters::Parameters():
104104
// gpu
105105
PARAM_GPU(PARAM_GPU_ID, "--gpu", "Use GPU", "Use GPU (CUDA) if possible", typeid(int), (void *) &gpu, "^[0-1]{1}$", MMseqsParameter::COMMAND_COMMON),
106106
PARAM_GPU_SERVER(PARAM_GPU_SERVER_ID, "--gpu-server", "Use GPU server", "Use GPU server", typeid(int), (void *) &gpuServer, "^[0-1]{1}$", MMseqsParameter::COMMAND_COMMON),
107+
PARAM_GPU_SERVER_WAIT_TIMEOUT(PARAM_GPU_SERVER_WAIT_TIMEOUT_ID, "--gpu-server-wait-timeout", "Wait for GPU server", "Wait for GPU server for 0: don't wait -1: no wait limit: >0 this many seconds", typeid(int), (void *) &gpuServerWaitTimeout, "^-?[0-9]+", MMseqsParameter::COMMAND_COMMON),
107108
// convertalignments
108109
PARAM_FORMAT_MODE(PARAM_FORMAT_MODE_ID, "--format-mode", "Alignment format", "Output format:\n0: BLAST-TAB\n1: SAM\n2: BLAST-TAB + query/db length\n3: Pretty HTML\n4: BLAST-TAB + column headers\nBLAST-TAB (0) and BLAST-TAB + column headers (4) support custom output formats (--format-output)", typeid(int), (void *) &formatAlignmentMode, "^[0-4]{1}$"),
109110
PARAM_FORMAT_OUTPUT(PARAM_FORMAT_OUTPUT_ID, "--format-output", "Format alignment output", "Choose comma separated list of output columns from: query,target,evalue,gapopen,pident,fident,nident,qstart,qend,qlen\ntstart,tend,tlen,alnlen,raw,bits,cigar,qseq,tseq,qheader,theader,qaln,taln,qframe,tframe,mismatch,qcov,tcov\nqset,qsetid,tset,tsetid,taxid,taxname,taxlineage,qorfstart,qorfend,torfstart,torfend,ppos", typeid(std::string), (void *) &outfmt, ""),
@@ -455,6 +456,7 @@ Parameters::Parameters():
455456
ungappedprefilter.push_back(&PARAM_PRELOAD_MODE);
456457
ungappedprefilter.push_back(&PARAM_GPU);
457458
ungappedprefilter.push_back(&PARAM_GPU_SERVER);
459+
ungappedprefilter.push_back(&PARAM_GPU_SERVER_WAIT_TIMEOUT);
458460
ungappedprefilter.push_back(&PARAM_PREF_MODE);
459461
ungappedprefilter.push_back(&PARAM_THREADS);
460462
ungappedprefilter.push_back(&PARAM_COMPRESSED);
@@ -1357,6 +1359,7 @@ Parameters::Parameters():
13571359
clusterworkflow = combineList(clusterworkflow, linclustworkflow);
13581360
clusterworkflow = removeParameter(clusterworkflow, PARAM_GPU);
13591361
clusterworkflow = removeParameter(clusterworkflow, PARAM_GPU_SERVER);
1362+
clusterworkflow = removeParameter(clusterworkflow, PARAM_GPU_SERVER_WAIT_TIMEOUT);
13601363
13611364
// easyclusterworkflow
13621365
easyclusterworkflow = combineList(clusterworkflow, createdb);
@@ -1400,6 +1403,7 @@ Parameters::Parameters():
14001403
clusterUpdate.push_back(&PARAM_RECOVER_DELETED);
14011404
clusterUpdate = removeParameter(clusterUpdate, PARAM_GPU);
14021405
clusterUpdate = removeParameter(clusterUpdate, PARAM_GPU_SERVER);
1406+
clusterUpdate = removeParameter(clusterUpdate, PARAM_GPU_SERVER_WAIT_TIMEOUT);
14031407
14041408
mapworkflow = combineList(prefilter, rescorediagonal);
14051409
mapworkflow = combineList(mapworkflow, extractorfs);
@@ -1410,6 +1414,7 @@ Parameters::Parameters():
14101414
mapworkflow.push_back(&PARAM_REMOVE_TMP_FILES);
14111415
mapworkflow = removeParameter(mapworkflow, PARAM_GPU);
14121416
mapworkflow = removeParameter(mapworkflow, PARAM_GPU_SERVER);
1417+
mapworkflow = removeParameter(mapworkflow, PARAM_GPU_SERVER_WAIT_TIMEOUT);
14131418
14141419
enrichworkflow = combineList(searchworkflow, prefilter);
14151420
enrichworkflow = combineList(enrichworkflow, subtractdbs);
@@ -1418,6 +1423,7 @@ Parameters::Parameters():
14181423
enrichworkflow = combineList(enrichworkflow, result2profile);
14191424
enrichworkflow = removeParameter(enrichworkflow, PARAM_GPU);
14201425
enrichworkflow = removeParameter(enrichworkflow, PARAM_GPU_SERVER);
1426+
enrichworkflow = removeParameter(enrichworkflow, PARAM_GPU_SERVER_WAIT_TIMEOUT);
14211427
14221428
databases.push_back(&PARAM_HELP);
14231429
databases.push_back(&PARAM_HELP_LONG);
@@ -2468,6 +2474,7 @@ void Parameters::setDefaults() {
24682474
}
24692475
#endif
24702476
gpuServer = 0;
2477+
gpuServerWaitTimeout = 10 * 60;
24712478
#ifdef HAVE_CUDA
24722479
char* gpuServerEnv = getenv("MMSEQS_FORCE_GPUSERVER");
24732480
if (gpuServerEnv != NULL) {

src/commons/Parameters.h

+2
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,7 @@ class Parameters {
394394
int verbosity; // log level
395395
int gpu; // use GPU
396396
int gpuServer; // use the gpu server
397+
int gpuServerWaitTimeout; // wait for this many seconds until GPU server is ready
397398
int threads; // Amounts of threads
398399
int compressed; // compressed writer
399400
bool removeTmpFiles; // Do not delete temp files
@@ -820,6 +821,7 @@ class Parameters {
820821
// gpu
821822
PARAMETER(PARAM_GPU)
822823
PARAMETER(PARAM_GPU_SERVER)
824+
PARAMETER(PARAM_GPU_SERVER_WAIT_TIMEOUT)
823825
// format alignment
824826
PARAMETER(PARAM_FORMAT_MODE)
825827
PARAMETER(PARAM_FORMAT_OUTPUT)

src/prefiltering/PrefilteringIndexReader.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -593,7 +593,7 @@ std::string PrefilteringIndexReader::searchForIndex(const std::string &pathToDB)
593593
return "";
594594
}
595595

596-
std::string PrefilteringIndexReader::dbPathWithoutIndex(std::string & dbname) {
596+
std::string PrefilteringIndexReader::dbPathWithoutIndex(const std::string& dbname) {
597597
std::string rawname = dbname;
598598
// check for .idx
599599
size_t idxlastpos = dbname.rfind(".idx");

src/prefiltering/PrefilteringIndexReader.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ class PrefilteringIndexReader {
8686

8787
static std::string searchForIndex(const std::string &pathToDB);
8888

89-
static std::string dbPathWithoutIndex(std::string &dbname);
89+
static std::string dbPathWithoutIndex(const std::string &dbname);
9090

9191
private:
9292
static void printMeta(int *meta);

src/prefiltering/ungappedprefilter.cpp

+46-10
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818

1919
#include <fcntl.h>
2020
#include <sys/mman.h>
21+
#include <chrono>
22+
#include <thread>
2123

2224
#ifdef OPENMP
2325
#include <omp.h>
@@ -60,14 +62,48 @@ void runFilterOnGpu(Parameters & par, BaseMatrix * subMat,
6062
memset(compositionBias, 0, compBufferSize);
6163
}
6264

63-
// hash the realpath of par.db2
64-
std::string tdbrName = par.db2;
65-
std::string tdbrRelPath = FileUtil::getRealPathFromSymLink(PrefilteringIndexReader::dbPathWithoutIndex(tdbrName));
66-
size_t hash = Util::hash(tdbrRelPath.c_str(), tdbrRelPath.length());
67-
std::string shmFileInFile = (par.gpuServer == 0) ? "" : "/dev/shm/" + SSTR(hash);
68-
if (shmFileInFile != "" && FileUtil::fileExists(shmFileInFile.c_str()) == false) {
69-
Debug(Debug::ERROR) << "--gpu-server " << shmFileInFile << " does not exist";
70-
EXIT(EXIT_FAILURE);
65+
std::string hash = "";
66+
if (par.gpuServer != 0) {
67+
hash = GPUSharedMemory::getShmHash(par.db2);
68+
std::string path = "/dev/shm/" + hash;
69+
int waitTimeout = par.gpuServerWaitTimeout;
70+
std::chrono::steady_clock::time_point startTime = std::chrono::steady_clock::now();
71+
bool statusPrinted = false;
72+
while (true) {
73+
size_t shmSize = FileUtil::getFileSize(path);
74+
// server is ready once the shm file exists and is not 0 byte large
75+
if (shmSize != (size_t)-1 && shmSize > 0) {
76+
break;
77+
}
78+
79+
if (waitTimeout == 0) {
80+
Debug(Debug::ERROR)
81+
<< "gpuserver for database " << par.db2 << " not found.\n"
82+
<< "Please start gpuserver with the same CUDA_VISIBLE_DEVICES\n";
83+
EXIT(EXIT_FAILURE);
84+
}
85+
86+
if (waitTimeout > 0) {
87+
if (statusPrinted == false) {
88+
Debug(Debug::INFO) << "Waiting for `gpuserver`";
89+
statusPrinted = true;
90+
} else {
91+
Debug(Debug::INFO) << ".";
92+
}
93+
std::chrono::steady_clock::time_point now = std::chrono::steady_clock::now();
94+
auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(now - startTime).count();
95+
if (elapsed >= waitTimeout) {
96+
Debug(Debug::ERROR)
97+
<< "gpuserver for database " << par.db2 << " not found after " << elapsed << "seconds.\n"
98+
<< "Please start gpuserver with the same CUDA_VISIBLE_DEVICES\n";
99+
EXIT(EXIT_FAILURE);
100+
}
101+
}
102+
std::this_thread::sleep_for(std::chrono::milliseconds(500));
103+
}
104+
if (waitTimeout > 0 && statusPrinted) {
105+
Debug(Debug::INFO) << "\n";
106+
}
71107
}
72108

73109
size_t* offsetData = NULL;
@@ -76,7 +112,7 @@ void runFilterOnGpu(Parameters & par, BaseMatrix * subMat,
76112
std::vector<int32_t> lengths;
77113
GPUSharedMemory* layout = NULL;
78114
pid_t pid = 0; // current process ID, only for server
79-
if (FileUtil::fileExists(shmFileInFile.c_str()) == false) {
115+
if (hash.empty()) {
80116
offsets.reserve(tdbr->getSize() + 1);
81117
lengths.reserve(tdbr->getSize());
82118
for (size_t id = 0; id < tdbr->getSize(); id++) {
@@ -88,7 +124,7 @@ void runFilterOnGpu(Parameters & par, BaseMatrix * subMat,
88124
lengthData = lengths.data();
89125
} else {
90126
pid = getpid();
91-
layout = GPUSharedMemory::openSharedMemory(SSTR(hash));
127+
layout = GPUSharedMemory::openSharedMemory(hash);
92128
}
93129

94130
const bool serverMode = par.gpuServer;

src/util/gpuserver.cpp

+3-7
Original file line numberDiff line numberDiff line change
@@ -48,13 +48,6 @@ int gpuserver(int argc, const char **argv, const Command& command) {
4848
offsets.emplace_back(offsets.back() + lengths.back());
4949
int32_t maxTargetLength = lengths.back();
5050

51-
std::string dbrName = par.db1;
52-
std::string dbrRelPath = FileUtil::getRealPathFromSymLink(PrefilteringIndexReader::dbPathWithoutIndex(dbrName));
53-
size_t hash = Util::hash(dbrRelPath.c_str(), dbrRelPath.length());
54-
55-
std::string shmFile = SSTR(hash);
56-
GPUSharedMemory* layout = GPUSharedMemory::alloc(shmFile, par.maxSeqLen , par.maxResListLen); // Adjust sizes as necessary
57-
5851
BaseMatrix *subMat;
5952
if (Parameters::isEqualDbtype(dbrIdx.sequenceReader->getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)) {
6053
subMat = new NucleotideMatrix(par.scoringMatrixFile.values.nucleotide().c_str(), 1.0, 0.0);
@@ -78,6 +71,9 @@ int gpuserver(int argc, const char **argv, const Command& command) {
7871
// Set up the handler for SIGINT and SIGTERM
7972
sigaction(SIGINT, &act, NULL);
8073
sigaction(SIGTERM, &act, NULL);
74+
75+
std::string shmFile = GPUSharedMemory::getShmHash(par.db1);
76+
GPUSharedMemory* layout = GPUSharedMemory::alloc(shmFile, par.maxSeqLen, par.maxResListLen);
8177
Debug(Debug::WARNING) << shmFile << "\n";
8278
while (keepRunning) {
8379
while (layout->serverReady.load(std::memory_order_acquire) == 0 || layout->clientReady.load(std::memory_order_acquire) == 0) {

0 commit comments

Comments
 (0)