Skip to content

Commit 91805e2

Browse files
committed
Changed to keep map of DNS addresses due to 5 second DNS delays.
1 parent d2eb8d9 commit 91805e2

File tree

18 files changed

+114
-129
lines changed

18 files changed

+114
-129
lines changed

admin/templates/configuration/etc/log4cxx.index.properties

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,9 @@
55
# export LSST_LOG_CONFIG=$HOME/.lsst/log4cxx.unittest.properties
66
#
77

8-
#log4j.rootLogger=INFO, CONSOLE
9-
log4j.rootLogger=DEBUG, CONSOLE
8+
log4j.rootLogger=INFO, CONSOLE
9+
#log4j.rootLogger=DEBUG, CONSOLE
10+
#log4j.rootLogger=WARN, CONSOLE
1011

1112
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
1213
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout

admin/tools/docker/index/container/dev/README

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@
66
# These are the optional building steps (in case if this container needs to
77
# be rebuilt from scratch (which is going to take ~2 hours or longer)
88

9-
#docker build -t qserv/replica:dev . &&&
10-
#docker push qserv/replica:dev &&&
119
docker build -t qserv/loaderbase:dev .
1210
docker push qserv/loaderbase:dev
1311

admin/tools/docker/index/container/dev/master/appMaster.bash

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ source /qserv/stack/loadLSST.bash
1313
cd /home/qserv/dev/qserv
1414
setup -r . -t qserv-dev
1515

16-
export LSST_LOG_CONFIG=/home/qserv/dev/qserv/admin/templates/configuration/etc/log4cxx.index.properties
16+
export LSST_LOG_CONFIG=/home/qserv/dev/qserv/admin/templates/configuration/etc/log4cxx.index_master.properties
1717

1818
/home/qserv/dev/qserv/build/loader/appMaster /home/qserv/dev/qserv/core/modules/loader/config/master.cnf
1919

core/modules/loader/BufferUdp.cc

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -134,20 +134,7 @@ std::shared_ptr<MsgElement> BufferUdp::_safeRetrieve(std::string const& note) {
134134

135135
bool BufferUdp::isRetrieveSafe(size_t len) const {
136136
auto newLen = (_rCursor + len);
137-
// &&&return (newLen <= _end && newLen <= _wCursor);
138-
bool res = (newLen <= _end && newLen <= _wCursor); // &&&
139-
if (!res) { // &&&
140-
LOGS(_log, LOG_LVL_WARN, "&&& BufferUdp::isRetrieveSafe not safe len=" << len <<
141-
" rCursor=" << (void*)_rCursor <<
142-
" newLen=" << (void*)newLen <<
143-
" wCursor=" << (void*)_wCursor <<
144-
" _end=" << (void*)_end <<
145-
" (newLen<=end)=" << (newLen <= _end) <<
146-
" (newLen<=_wCursor)=" << (newLen <= _wCursor) <<
147-
" res=" << res);
148-
LOGS(_log, LOG_LVL_WARN, "&&& BufferUdp::isRetrieveSafe " << dumpStr(false));
149-
}
150-
return res;
137+
return (newLen <= _end && newLen <= _wCursor);
151138
}
152139

153140

@@ -157,7 +144,7 @@ bool BufferUdp::retrieve(void* out, size_t len) {
157144
_rCursor += len;
158145
return true;
159146
}
160-
LOGS(_log, LOG_LVL_WARN, "&&& BufferUdp::retrieve not safe len=" << len);
147+
LOGS(_log, LOG_LVL_DEBUG, "BufferUdp::retrieve not safe len=" << len);
161148
return false;
162149
}
163150

core/modules/loader/CentralClient.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ void CentralClient::_handleKeyLookup(LoaderMsg const& inMsg, std::unique_ptr<pro
128128
_waitingKeyLookupMap.erase(iter);
129129
}
130130
keyLookupOneShot->keyInfoComplete(key, chunkInfo.chunk, chunkInfo.subchunk, protoData->success());
131-
LOGS(_log, LOG_LVL_INFO, "Successful KEY_LOOKUP key=" << key << " " << chunkInfo);
131+
LOGS(_log, LOG_LVL_WARN, "&&&INFO Successful KEY_LOOKUP key=" << key << " " << chunkInfo);
132132
}
133133

134134

@@ -173,7 +173,7 @@ void CentralClient::_handleKeyInsertComplete(LoaderMsg const& inMsg, std::unique
173173
mapSize = _waitingKeyInsertMap.size();
174174
}
175175
keyInsertOneShot->keyInsertComplete();
176-
LOGS(_log, LOG_LVL_INFO, "Successful KEY_INSERT_COMPLETE key=" << key << " " << chunkInfo <<
176+
LOGS(_log, LOG_LVL_WARN, "&&&INFO Successful KEY_INSERT_COMPLETE key=" << key << " " << chunkInfo <<
177177
" mapSize=" << mapSize);
178178
}
179179

@@ -355,7 +355,7 @@ void CentralClient::getWorkerForKey(CompositeKey const& key, std::string& ip, in
355355
auto nAddr = worker->getUdpAddress();
356356
ip = nAddr.ip;
357357
port = nAddr.port;
358-
LOGS(_log, LOG_LVL_DEBUG, "getWorkerForKey " << key << " worker=" << worker);
358+
LOGS(_log, LOG_LVL_DEBUG, "getWorkerForKey " << key << " worker=" << *worker);
359359
} else {
360360
ip = getDefWorkerHost();
361361
port = getDefWorkerPortUdp();

core/modules/loader/CentralWorker.cc

Lines changed: 19 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
#include "loader/WorkerConfig.h"
3737
#include "proto/loader.pb.h"
3838
#include "proto/ProtoImporter.h"
39+
#include "util/Timer.h" // &&&
3940

4041

4142
// LSST headers
@@ -126,6 +127,7 @@ void CentralWorker::_monitor() {
126127
usleep(500000);
127128
return;
128129
}
130+
LOGS(_log, LOG_LVL_WARN, "&&& CentralWorker::_monitor A");
129131

130132
// If data gets shifted, check everything again as ranges will have
131133
// changed and there may be a lot more data to shift.
@@ -191,6 +193,7 @@ void CentralWorker::_monitor() {
191193
_sendWorkerKeysInfo(masterAddr, getNextMsgId());
192194
}
193195
} while (dataShifted);
196+
LOGS(_log, LOG_LVL_WARN, "&&& CentralWorker::_monitor Z");
194197
}
195198

196199

@@ -667,80 +670,6 @@ void CentralWorker::cancelShiftsWithLeftNeighbor() {
667670
}
668671
}
669672

670-
/* &&&
671-
bool CentralWorker::workerInfoReceive(BufferUdp::Ptr const& data) {
672-
// Open the data protobuffer and add it to our list.
673-
StringElement::Ptr sData = std::dynamic_pointer_cast<StringElement>(MsgElement::retrieve(*data, " CentralWorker::workerInfoReceive&&& "));
674-
if (sData == nullptr) {
675-
LOGS(_log, LOG_LVL_WARN, "CentralWorker::workerInfoRecieve Failed to parse list");
676-
return false;
677-
}
678-
std::unique_ptr<proto::WorkerListItem> protoList = sData->protoParse<proto::WorkerListItem>();
679-
if (protoList == nullptr) {
680-
LOGS(_log, LOG_LVL_WARN, "CentralWorker::workerInfoRecieve Failed to parse list");
681-
return false;
682-
}
683-
684-
// TODO: move this call to another thread
685-
_workerInfoReceive(protoList);
686-
return true;
687-
}
688-
*/
689-
690-
/* &&&
691-
void CentralWorker::_workerInfoReceive(std::unique_ptr<proto::WorkerListItem>& protoL) {
692-
std::unique_ptr<proto::WorkerListItem> protoList(std::move(protoL));
693-
694-
// Check the information, if it is our network address, set or check our id.
695-
// Then compare it with the map, adding new/changed information.
696-
uint32_t wId = protoList->wid();
697-
std::string ipUdp("");
698-
int portUdp = 0;
699-
int portTcp = 0;
700-
if (protoList->has_address()) {
701-
proto::LdrNetAddress protoAddr = protoList->address();
702-
ipUdp = protoAddr.ip();
703-
portUdp = protoAddr.udpport();
704-
portTcp = protoAddr.tcpport();
705-
}
706-
KeyRange strRange;
707-
if (protoList->has_range()) {
708-
proto::WorkerRange protoRange = protoList->range();
709-
bool valid = protoRange.valid();
710-
if (valid) {
711-
CompositeKey min(protoRange.minint(), protoRange.minstr());
712-
CompositeKey max(protoRange.maxint(), protoRange.maxstr());
713-
bool unlimited = protoRange.maxunlimited();
714-
strRange.setMinMax(min, max, unlimited);
715-
}
716-
}
717-
718-
// If the address matches ours, check the name.
719-
if (getHostName() == ipUdp && getUdpPort() == portUdp) {
720-
if (_isOurIdInvalid()) {
721-
LOGS(_log, LOG_LVL_INFO, "Setting our name " << wId);
722-
_setOurId(wId);
723-
} else if (getOurId() != wId) {
724-
LOGS(_log, LOG_LVL_ERROR, "Our wId doesn't match address from master! wId=" <<
725-
getOurId() << " from master=" << wId);
726-
}
727-
728-
// It is this worker. If there is a valid range in the message and our range is not valid,
729-
// take the range given as our own.
730-
if (strRange.getValid()) {
731-
std::lock_guard<std::mutex> lckM(_idMapMtx);
732-
if (not _keyRange.getValid()) {
733-
LOGS(_log, LOG_LVL_INFO, "Setting our range " << strRange);
734-
_keyRange.setMinMax(strRange.getMin(), strRange.getMax(), strRange.getUnlimited());
735-
}
736-
}
737-
}
738-
739-
// Make/update entry in map.
740-
_wWorkerList->updateEntry(wId, ipUdp, portUdp, portTcp, strRange);
741-
}
742-
*/
743-
744673

745674
void CentralWorker::checkForThisWorkerValues(uint32_t wId, std::string const& ip,
746675
int portUdp, int portTcp, KeyRange& strRange) {
@@ -826,6 +755,9 @@ bool CentralWorker::workerKeyInsertReq(LoaderMsg const& inMsg, BufferUdp::Ptr co
826755
}
827756

828757

758+
util::Timer lastInsertTimer; // &&&
759+
std::mutex lastInsertTimerMtx; // &&&
760+
829761
void CentralWorker::_workerKeyInsertReq(LoaderMsg const& inMsg, std::unique_ptr<proto::KeyInfoInsert>& protoBuf) {
830762
std::unique_ptr<proto::KeyInfoInsert> protoData(std::move(protoBuf));
831763

@@ -850,7 +782,17 @@ void CentralWorker::_workerKeyInsertReq(LoaderMsg const& inMsg, std::unique_ptr<
850782
// Element already found, check file id and row number. Bad if not the same.
851783
// TODO HIGH send back duplicate key mismatch message to the original requester and return
852784
}
853-
LOGS(_log, LOG_LVL_INFO, "Key inserted=" << key << "(" << chunkInfo << ")");
785+
{
786+
std::lock_guard<std::mutex> tLg(lastInsertTimerMtx);
787+
lastInsertTimer.stop();
788+
auto elapsedInsert = lastInsertTimer.getElapsed();
789+
if (elapsedInsert > 0.5) {
790+
LOGS(_log, LOG_LVL_ERROR, "&&& Longdelay key=" << key << " dlay=" << elapsedInsert);
791+
}
792+
// &&& LOGS(_log, LOG_LVL_INFO, "Key inserted=" << key << "(" << chunkInfo << ")");
793+
LOGS(_log, LOG_LVL_WARN, "&&&INFO Key inserted=" << key << "(" << chunkInfo << ") dlay=" << elapsedInsert);
794+
lastInsertTimer.start();
795+
}
854796
// TODO Send this item to the keyLogger (which would then send KEY_INSERT_COMPLETE back to the requester),
855797
// for now this function will send the message back for proof of concept.
856798
LoaderMsg msg(LoaderMsg::KEY_INSERT_COMPLETE, inMsg.msgId->element, getHostName(), getUdpPort());
@@ -898,10 +840,10 @@ void CentralWorker::_forwardKeyInsertRequest(NetworkAddress const& targetAddr, L
898840
// The proto buffer should be the same, just need a new message.
899841
int hops = protoData->hops() + 1;
900842
if (hops > 4) { // TODO replace magic number with variable set via config file.
901-
LOGS(_log, LOG_LVL_INFO, "Too many hops, dropping insert request hops=" << hops << " key=" << key);
843+
LOGS(_log, LOG_LVL_WARN, "Too many hops, dropping insert request hops=" << hops << " key=" << key);
902844
return;
903845
}
904-
LOGS(_log, LOG_LVL_INFO, "Forwarding key insert hops=" << hops << " key=" << key);
846+
LOGS(_log, LOG_LVL_WARN, "&&&INFO Forwarding key insert hops=" << hops << " key=" << key);
905847
LoaderMsg msg(LoaderMsg::KEY_INSERT_REQ, inMsg.msgId->element, getHostName(), getUdpPort());
906848
BufferUdp msgData;
907849
msg.appendToData(msgData);

core/modules/loader/DoListItem.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,8 @@ class DoListItem : public std::enable_shared_from_this<DoListItem> {
9898

9999
virtual ~DoListItem() = default;
100100

101+
util::CommandTracked::Ptr runIfNeeded(TimeOut::TimePoint now);
102+
/* &&&
101103
util::CommandTracked::Ptr runIfNeeded(TimeOut::TimePoint now) {
102104
std::lock_guard<std::mutex> lock(_mtx);
103105
if (_command == nullptr) {
@@ -117,6 +119,7 @@ class DoListItem : public std::enable_shared_from_this<DoListItem> {
117119
}
118120
return nullptr;
119121
}
122+
*/
120123

121124
bool isAlreadyOnList() { return _addedToList; }
122125

@@ -177,7 +180,7 @@ class DoListItem : public std::enable_shared_from_this<DoListItem> {
177180
/// If no info is needed, check for info after this period of time.
178181
TimeOut _timeOut{std::chrono::minutes(5)};
179182
/// Rate limiter, no more than 1 message every few seconds
180-
TimeOut _timeRateLimit{std::chrono::milliseconds(1500)}; // TODO: DM-17453 set via config
183+
TimeOut _timeRateLimit{std::chrono::milliseconds(7500)}; // TODO: DM-17453 set via config
181184
util::CommandTracked::Ptr _command;
182185
std::mutex _mtx; ///< protects _timeOut, _timeRequest, _command, _oneShot, _needInfo
183186
/// Number of times the command needed to be created. It's only tracked for oneShots as

core/modules/loader/ServerTcpBase.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -443,7 +443,7 @@ void TcpBaseConnection::_handleImYourLNeighbor1(boost::system::error_code const&
443443
CompositeKey maxKey(protoRange.maxint(), protoRange.maxstr());
444444
bool unlimited = protoRange.maxunlimited();
445445
leftRange.setMinMax(minKey, maxKey, unlimited);
446-
LOGS(_log, LOG_LVL_WARN, funcName << " leftRange=" << leftRange);
446+
LOGS(_log, LOG_LVL_INFO, funcName << " leftRange=" << leftRange);
447447
newLeftRange = _serverTcpBase->getCentralWorker()->updateRangeWithLeftData(leftRange);
448448
}
449449
proto::Neighbor protoLeftNeigh = protoItem->left();

core/modules/loader/ServerUdpBase.cc

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ void ServerUdpBase::_receiveCallback(boost::system::error_code const& error, siz
7474

7575

7676
void ServerUdpBase::_sendResponse() {
77+
LOGS(_log, LOG_LVL_INFO, "&&&udp:_sendResponse");
7778
_socket.async_send_to(boost::asio::buffer(_sendData->getReadCursor(), _sendData->getBytesLeftToRead()),
7879
_senderEndpoint,
7980
[this](boost::system::error_code const& ec, std::size_t bytes_transferred) {
@@ -87,7 +88,16 @@ void ServerUdpBase::sendBufferTo(std::string const& hostName, int port, BufferUd
8788
using namespace boost::asio;
8889
LOGS(_log, LOG_LVL_DEBUG, "ServerUdpBase::sendBufferTo hostName=" << hostName << " port=" << port);
8990
try {
90-
ip::udp::endpoint dest = resolve(hostName, port);
91+
NetworkAddress addr(hostName, port);
92+
ip::udp::endpoint dest;
93+
auto iter = _resolvMap.find(addr);
94+
if (iter == _resolvMap.end()) {
95+
dest = resolve(hostName, port); // may throw boost::system::system_error
96+
_resolvMap[addr] = dest;
97+
} else {
98+
dest = iter->second;
99+
}
100+
// &&& ip::udp::endpoint dest = resolve(hostName, port);
91101
_socket.send_to(buffer(sendBuf.getReadCursor(), sendBuf.getBytesLeftToRead()), dest);
92102
} catch (boost::system::system_error const& e) {
93103
LOGS(_log, LOG_LVL_ERROR, "ServerUdpBase::sendBufferTo boost system_error=" << e.what() <<
@@ -114,6 +124,7 @@ void ServerUdpBase::_sendCallback(const boost::system::error_code& error, size_t
114124
}
115125

116126
void ServerUdpBase::_receivePrepare() {
127+
LOGS(_log, LOG_LVL_INFO, "&&&udp:_receivePrepare");
117128
_data = std::make_shared<BufferUdp>(); // New buffer for next response, the old buffer
118129
// may still be in use elsewhere.
119130
_socket.async_receive_from(boost::asio::buffer(_data->getWriteCursor(),
@@ -130,8 +141,10 @@ boost::asio::ip::udp::endpoint ServerUdpBase::resolve(std::string const& hostNam
130141
using namespace boost::asio;
131142
// Resolver returns an iterator. This uses the first item only.
132143
// Failure to resolve anything throws a boost::system::error.
144+
LOGS(_log, LOG_LVL_INFO, "&&&udp:resolve a");
133145
ip::udp::endpoint dest =
134146
*_resolver.resolve(ip::udp::v4(), hostName, std::to_string(port)).begin();
147+
LOGS(_log, LOG_LVL_INFO, "&&&udp:resolve b");
135148
return dest;
136149
}
137150

core/modules/loader/ServerUdpBase.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232

3333
// Qserv headers
3434
#include "loader/BufferUdp.h"
35+
#include "loader/NetworkAddress.h"
3536

3637
namespace lsst {
3738
namespace qserv {
@@ -87,6 +88,10 @@ class ServerUdpBase {
8788
std::string _hostName;
8889
int _port;
8990

91+
/// Map and mutex to store ip destinations
92+
// TODO: add occasional checks to see if addresses changed
93+
std::map<NetworkAddress, boost::asio::ip::udp::endpoint> _resolvMap;
94+
9095
/// Items for resolving UDP addresses
9196
/// There appear to be concurrency issues even with
9297
/// separate io_contexts, so re-using existing objects.

0 commit comments

Comments
 (0)