40
40
41
41
#include " common/compiler_util.h" // IWYU pragma: keep
42
42
#include " common/config.h"
43
+ #include " common/status.h"
43
44
#include " runtime/exec_env.h"
45
+ #include " service/backend_options.h"
44
46
#include " util/dns_cache.h"
45
47
#include " util/network_util.h"
46
48
@@ -56,6 +58,80 @@ using StubMap = phmap::parallel_flat_hash_map<
56
58
57
59
namespace doris {
58
60
61
+ class FailureDetectClosure : public ::google::protobuf::Closure {
62
+ public:
63
+ FailureDetectClosure (std::shared_ptr<AtomicStatus>& channel_st,
64
+ ::google::protobuf::RpcController* controller,
65
+ ::google::protobuf::Closure* done)
66
+ : _channel_st(channel_st), _controller(controller), _done(done) {}
67
+
68
+ void Run () override {
69
+ Defer defer {[&]() { delete this ; }};
70
+ // All brpc related API will use brpc::Controller, so that it is safe
71
+ // to do static cast here.
72
+ auto * cntl = static_cast <brpc::Controller*>(_controller);
73
+ if (cntl->Failed () && cntl->ErrorCode () == EHOSTDOWN) {
74
+ Status error_st = Status::NetworkError (
75
+ " Failed to send brpc, error={}, error_text={}, client: {}, latency = {}" ,
76
+ berror (cntl->ErrorCode ()), cntl->ErrorText (), BackendOptions::get_localhost (),
77
+ cntl->latency_us ());
78
+ LOG (WARNING) << error_st;
79
+ _channel_st->update (error_st);
80
+ }
81
+ // Sometimes done == nullptr, for example hand_shake API.
82
+ if (_done != nullptr ) {
83
+ _done->Run ();
84
+ }
85
+ // _done->Run may throw exception, so that move delete this to Defer.
86
+ // delete this;
87
+ }
88
+
89
+ private:
90
+ std::shared_ptr<AtomicStatus> _channel_st;
91
+ ::google::protobuf::RpcController* _controller;
92
+ ::google::protobuf::Closure* _done;
93
+ };
94
+
95
+ // This channel will use FailureDetectClosure to wrap the original closure
96
+ // If some non-recoverable rpc failure happens, it will save the error status in
97
+ // _channel_st.
98
+ // And brpc client cache will depend on it to detect if the client is health.
99
+ class FailureDetectChannel : public ::brpc::Channel {
100
+ public:
101
+ FailureDetectChannel () : ::brpc::Channel() {
102
+ _channel_st = std::make_shared<AtomicStatus>(); // default OK
103
+ }
104
+ void CallMethod (const google::protobuf::MethodDescriptor* method,
105
+ google::protobuf::RpcController* controller,
106
+ const google::protobuf::Message* request, google::protobuf::Message* response,
107
+ google::protobuf::Closure* done) override {
108
+ FailureDetectClosure* failure_detect_closure = nullptr ;
109
+ if (done != nullptr ) {
110
+ // If done == nullptr, then it means the call is sync call, so that should not
111
+ // gen a failure detect closure for it. Or it will core.
112
+ failure_detect_closure = new FailureDetectClosure (_channel_st, controller, done);
113
+ }
114
+ ::brpc::Channel::CallMethod (method, controller, request, response, failure_detect_closure);
115
+ // Done == nullptr, it is a sync call, should also deal with the bad channel.
116
+ if (done == nullptr ) {
117
+ auto * cntl = static_cast <brpc::Controller*>(controller);
118
+ if (cntl->Failed () && cntl->ErrorCode () == EHOSTDOWN) {
119
+ Status error_st = Status::NetworkError (
120
+ " Failed to send brpc, error={}, error_text={}, client: {}, latency = {}" ,
121
+ berror (cntl->ErrorCode ()), cntl->ErrorText (),
122
+ BackendOptions::get_localhost (), cntl->latency_us ());
123
+ LOG (WARNING) << error_st;
124
+ _channel_st->update (error_st);
125
+ }
126
+ }
127
+ }
128
+
129
+ std::shared_ptr<AtomicStatus> channel_status () { return _channel_st; }
130
+
131
+ private:
132
+ std::shared_ptr<AtomicStatus> _channel_st;
133
+ };
134
+
59
135
template <class T >
60
136
class BrpcClientCache {
61
137
public:
@@ -99,7 +175,14 @@ class BrpcClientCache {
99
175
auto get_value = [&stub_ptr](const auto & v) { stub_ptr = v.second ; };
100
176
if (LIKELY (_stub_map.if_contains (host_port, get_value))) {
101
177
DCHECK (stub_ptr != nullptr );
102
- return stub_ptr;
178
+ // All client created from this cache will use FailureDetectChannel, so it is
179
+ // safe to do static cast here.
180
+ // Check if the base channel is OK, if not ignore the stub and create new one.
181
+ if (static_cast <FailureDetectChannel*>(stub_ptr->channel ())->channel_status ()->ok ()) {
182
+ return stub_ptr;
183
+ } else {
184
+ _stub_map.erase (host_port);
185
+ }
103
186
}
104
187
105
188
// new one stub and insert into map
@@ -148,7 +231,7 @@ class BrpcClientCache {
148
231
options.timeout_ms = 2000 ;
149
232
options.max_retry = 10 ;
150
233
151
- std::unique_ptr<brpc::Channel > channel (new brpc::Channel ());
234
+ std::unique_ptr<FailureDetectChannel > channel (new FailureDetectChannel ());
152
235
int ret_code = 0 ;
153
236
if (host_port.find (" ://" ) == std::string::npos) {
154
237
ret_code = channel->Init (host_port.c_str (), &options);
0 commit comments