@@ -20,9 +20,10 @@ import (
var streamDownErr = status.Error(codes.Unavailable, "stream is down")

type request struct {
-	ctx  context.Context
-	msg  *Message
-	opts callOptions
+	ctx       context.Context
+	msg       *Message
+	opts      callOptions
+	numFailed int
}

// waitForSend returns true if the WithNoSendWaiting call option is not set.
@@ -60,6 +61,7 @@ type channel struct {
	cancelStream    context.CancelFunc
	responseRouters map[uint64]responseRouter
	responseMut     sync.Mutex
+	maxRetries      int // number of times we try to resend a failed msg
}

// newChannel creates a new channel for the given node and starts the sending goroutine.
@@ -76,6 +78,7 @@ func newChannel(n *RawNode) *channel {
		latency:         -1 * time.Second,
		rand:            rand.New(rand.NewSource(time.Now().UnixNano())),
		responseRouters: make(map[uint64]responseRouter),
+		maxRetries:      n.mgr.opts.maxRetries,
	}
	// parentCtx controls the channel and is used to shut it down
	c.parentCtx = n.newContext()
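The retry limit is copied into the channel from the manager options (`n.mgr.opts.maxRetries`); the public option that sets it is not shown in this diff. As a rough sketch of how it could be exposed, assuming gorums' usual functional-option pattern (the `WithMaxRetries` name and the `managerOptions` field layout are assumptions, not part of the change):

```go
// WithMaxRetries is a hypothetical manager option (not part of this diff)
// that sets how many times a failed message is re-enqueued before the
// error is routed back to the caller. A value of -1 would mean retry forever.
func WithMaxRetries(n int) ManagerOption {
	return func(o *managerOptions) {
		o.maxRetries = n
	}
}
```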
@@ -227,10 +230,21 @@ func (c *channel) sendMsg(req request) (err error) {
		case <-done:
			// false alarm
		default:
+			// CANCELLING HERE CAN HAVE DESTRUCTIVE EFFECTS!
+			// Imagine the client has sent several requests and is waiting
+			// for a response to each individual request. Furthermore, suppose
+			// the client has sent messages to two different handlers:
+			//  1. A handler that does a lot of work, so long response times are expected.
+			//  2. A handler that is normally very fast.
+			//
+			// If the impatient client cancels a request sent to the handler in scenario 2,
+			// then all requests sent to the handler in scenario 1 are also cancelled, because
+			// the stream is taken down.
+
			// trigger reconnect
-			c.streamMut.Lock()
-			c.cancelStream()
-			c.streamMut.Unlock()
+			// c.streamMut.Lock()
+			// c.cancelStream()
+			// c.streamMut.Unlock()
		}
	}
}()
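The comment above explains why the stream is no longer torn down here: cancelling the shared stream on behalf of one impatient request also kills every other in-flight request on that stream. A self-contained sketch of that failure mode, with plain contexts standing in for the gRPC stream and the two handlers (all names are illustrative):

```go
package main

import (
	"context"
	"fmt"
	"time"
)

func main() {
	// streamCtx stands in for the shared stream's context.
	streamCtx, cancelStream := context.WithCancel(context.Background())

	// Scenario 1: a slow handler; a long response time is expected.
	slowDone := make(chan error, 1)
	go func() {
		select {
		case <-time.After(2 * time.Second):
			slowDone <- nil // the slow call would have succeeded
		case <-streamCtx.Done():
			slowDone <- streamCtx.Err() // killed along with the stream
		}
	}()

	// Scenario 2: a normally fast handler, but the impatient client
	// gives up after 50ms.
	fastCtx, cancelFast := context.WithTimeout(context.Background(), 50*time.Millisecond)
	defer cancelFast()
	<-fastCtx.Done()

	// If the channel reacted to the cancelled request by taking the stream
	// down (the commented-out c.cancelStream() above), the unrelated slow
	// call is cancelled too:
	cancelStream()
	fmt.Println("slow call result:", <-slowDone) // prints: context canceled
}
```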
@@ -266,14 +280,16 @@ func (c *channel) sender() {
		}
		// return error if stream is broken
		if c.streamBroken.get() {
-			c.routeResponse(req.msg.Metadata.MessageID, response{nid: c.node.ID(), err: streamDownErr})
+			//c.routeResponse(req.msg.Metadata.MessageID, response{nid: c.node.ID(), err: streamDownErr})
+			go c.retryMsg(req, streamDownErr)
			continue
		}
		// else try to send message
		err := c.sendMsg(req)
		if err != nil {
			// return the error
-			c.routeResponse(req.msg.Metadata.MessageID, response{nid: c.node.ID(), err: err})
+			//c.routeResponse(req.msg.Metadata.MessageID, response{nid: c.node.ID(), err: err})
+			go c.retryMsg(req, err)
		}
	}
}
@@ -396,6 +412,38 @@ func (c *channel) reconnect(maxRetries float64) {
	}
}

+// retryMsg should always be run in a goroutine. It will
+// re-enqueue a msg that has previously failed. The message
+// will be dropped if it fails more than maxRetries times
+// or if its ctx is cancelled.
+func (c *channel) retryMsg(req request, err error) {
+	req.numFailed++
+	// c.maxRetries == -1 means infinite retries.
+	if req.numFailed > c.maxRetries && c.maxRetries != -1 {
+		c.routeResponse(req.msg.Metadata.MessageID, response{nid: c.node.ID(), err: fmt.Errorf("max retries exceeded: %w", err)})
+		return
+	}
+	//delay := float64(c.backoffCfg.BaseDelay)
+	delay := float64(10 * time.Millisecond)
+	max := float64(c.backoffCfg.MaxDelay)
+	for r := req.numFailed; delay < max && r > 0; r-- {
+		delay *= c.backoffCfg.Multiplier
+	}
+	delay = math.Min(delay, max)
+	delay *= 1 + c.backoffCfg.Jitter*(rand.Float64()*2-1)
+	select {
+	case <-c.parentCtx.Done():
+		c.routeResponse(req.msg.Metadata.MessageID, response{nid: c.node.ID(), err: fmt.Errorf("channel closed")})
+		return
+	case <-req.ctx.Done():
+		c.routeResponse(req.msg.Metadata.MessageID, response{nid: c.node.ID(), err: fmt.Errorf("context cancelled")})
+		return
+	case <-time.After(time.Duration(delay)):
+		// backoff elapsed; enqueue the request again
+	}
+	c.enqueueSlow(req)
+}
+
func (c *channel) setLastErr(err error) {
	c.mu.Lock()
	defer c.mu.Unlock()
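For reference, the delay computed in `retryMsg` is standard capped exponential backoff with jitter: start from a 10 ms base, multiply once per previous failure, clamp to the configured maximum, then randomize by ±jitter. A standalone sketch of the same arithmetic, using grpc-go's default backoff parameters as assumed stand-ins for `c.backoffCfg`:

```go
package main

import (
	"fmt"
	"math"
	"math/rand"
	"time"
)

// backoffDelay mirrors the computation in retryMsg. The constants are
// assumptions (grpc-go's default backoff config); the real values come
// from c.backoffCfg.
func backoffDelay(numFailed int) time.Duration {
	const (
		base       = float64(10 * time.Millisecond)
		multiplier = 1.6
		jitter     = 0.2
		maxDelay   = float64(120 * time.Second)
	)
	delay := base
	for r := numFailed; delay < maxDelay && r > 0; r-- {
		delay *= multiplier
	}
	delay = math.Min(delay, maxDelay)
	delay *= 1 + jitter*(rand.Float64()*2-1) // +/- 20% jitter
	return time.Duration(delay)
}

func main() {
	for attempt := 1; attempt <= 5; attempt++ {
		fmt.Printf("failure %d: wait ~%v\n", attempt, backoffDelay(attempt))
	}
}
```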