Skip to content

Commit 976444f

Browse files
fix(connection): will retry connection for each message
Each time a message is sent, the channel now tries to connect to the node if a connection has not yet been established or if the node has disconnected.
1 parent b44fc9a commit 976444f

File tree

5 files changed

+110
-36
lines changed

5 files changed

+110
-36
lines changed

channel.go

+59-8
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package gorums
22

33
import (
44
"context"
5+
"fmt"
56
"math"
67
"math/rand"
78
"sync"
@@ -35,7 +36,7 @@ type responseRouter struct {
3536

3637
type channel struct {
3738
sendQ chan request
38-
nodeID uint32
39+
node *RawNode
3940
mu sync.Mutex
4041
lastError error
4142
latency time.Duration
@@ -50,29 +51,39 @@ type channel struct {
5051
cancelStream context.CancelFunc
5152
responseRouters map[uint64]responseRouter
5253
responseMut sync.Mutex
54+
connEstablished bool
5355
}
5456

5557
func newChannel(n *RawNode) *channel {
5658
return &channel{
5759
sendQ: make(chan request, n.mgr.opts.sendBuffer),
5860
backoffCfg: n.mgr.opts.backoff,
59-
nodeID: n.ID(),
61+
node: n,
6062
latency: -1 * time.Second,
6163
rand: rand.New(rand.NewSource(time.Now().UnixNano())),
6264
responseRouters: make(map[uint64]responseRouter),
65+
connEstablished: false,
6366
}
6467
}
6568

6669
func (c *channel) connect(ctx context.Context, conn *grpc.ClientConn) error {
67-
var err error
6870
c.parentCtx = ctx
71+
go c.sendMsgs()
72+
if conn == nil {
73+
return fmt.Errorf("connection is nil")
74+
}
75+
return c.tryConnect(conn)
76+
}
77+
78+
func (c *channel) tryConnect(conn *grpc.ClientConn) error {
79+
var err error
6980
c.streamCtx, c.cancelStream = context.WithCancel(c.parentCtx)
7081
c.gorumsClient = ordering.NewGorumsClient(conn)
7182
c.gorumsStream, err = c.gorumsClient.NodeStream(c.streamCtx)
7283
if err != nil {
7384
return err
7485
}
75-
go c.sendMsgs()
86+
c.connEstablished = true
7687
go c.recvMsgs()
7788
return nil
7889
}
@@ -160,17 +171,23 @@ func (c *channel) sendMsgs() {
160171
return
161172
case req = <-c.sendQ:
162173
}
174+
// try to connect to the node if previous attempts
175+
// have failed or if the node has disconnected
176+
if !c.isConnected() {
177+
// streamBroken will be set if the connection fails
178+
c.tryReconnect()
179+
}
163180
// return error if stream is broken
164181
if c.streamBroken.get() {
165182
err := status.Errorf(codes.Unavailable, "stream is down")
166-
c.routeResponse(req.msg.Metadata.MessageID, response{nid: c.nodeID, msg: nil, err: err})
183+
c.routeResponse(req.msg.Metadata.MessageID, response{nid: c.node.ID(), msg: nil, err: err})
167184
continue
168185
}
169186
// else try to send message
170187
err := c.sendMsg(req)
171188
if err != nil {
172189
// return the error
173-
c.routeResponse(req.msg.Metadata.MessageID, response{nid: c.nodeID, msg: nil, err: err})
190+
c.routeResponse(req.msg.Metadata.MessageID, response{nid: c.node.ID(), msg: nil, err: err})
174191
}
175192
}
176193
}
@@ -189,7 +206,7 @@ func (c *channel) recvMsgs() {
189206
} else {
190207
c.streamMut.RUnlock()
191208
err := status.FromProto(resp.Metadata.GetStatus()).Err()
192-
c.routeResponse(resp.Metadata.MessageID, response{nid: c.nodeID, msg: resp.Message, err: err})
209+
c.routeResponse(resp.Metadata.MessageID, response{nid: c.node.ID(), msg: resp.Message, err: err})
193210
}
194211

195212
select {
@@ -200,11 +217,37 @@ func (c *channel) recvMsgs() {
200217
}
201218
}
202219

203-
func (c *channel) reconnect() {
220+
func (c *channel) tryReconnect() {
221+
// a connection has never been established
222+
if !c.connEstablished {
223+
err := c.node.dial()
224+
if err != nil {
225+
c.streamBroken.set()
226+
return
227+
}
228+
err = c.tryConnect(c.node.conn)
229+
if err != nil {
230+
c.streamBroken.set()
231+
return
232+
}
233+
}
234+
// the node has previously been connected
235+
// but is now disconnected
236+
if c.streamBroken.get() {
237+
// try to reconnect only once
238+
c.reconnect(1)
239+
}
240+
}
241+
242+
func (c *channel) reconnect(maxRetries ...int) {
204243
c.streamMut.Lock()
205244
defer c.streamMut.Unlock()
206245
backoffCfg := c.backoffCfg
207246

247+
var maxretries float64 = -1
248+
if len(maxRetries) > 0 {
249+
maxretries = float64(maxRetries[0])
250+
}
208251
var retries float64
209252
for {
210253
var err error
@@ -217,6 +260,10 @@ func (c *channel) reconnect() {
217260
}
218261
c.cancelStream()
219262
c.setLastErr(err)
263+
if retries >= maxretries && maxretries > 0 {
264+
c.streamBroken.set()
265+
return
266+
}
220267
delay := float64(backoffCfg.BaseDelay)
221268
max := float64(backoffCfg.MaxDelay)
222269
for r := retries; delay < max && r > 0; r-- {
@@ -257,6 +304,10 @@ type atomicFlag struct {
257304
flag int32
258305
}
259306

307+
func (c *channel) isConnected() bool {
308+
return c.connEstablished && !c.streamBroken.get()
309+
}
310+
260311
func (f *atomicFlag) set() { atomic.StoreInt32(&f.flag, 1) }
261312
func (f *atomicFlag) get() bool { return atomic.LoadInt32(&f.flag) == 1 }
262313
func (f *atomicFlag) clear() { atomic.StoreInt32(&f.flag, 0) }

gorums.pb.go

+13-13
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

mgr.go

+3-1
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,9 @@ func (m *RawManager) AddNode(node *RawNode) error {
119119
m.logger.Printf("connecting to %s with id %d\n", node, node.id)
120120
}
121121
if err := node.connect(m); err != nil {
122-
return fmt.Errorf("connection failed for %s: %w", node, err)
122+
if m.logger != nil {
123+
m.logger.Println(fmt.Errorf("connection failed for %s: %w. will retry later.", node, err))
124+
}
123125
}
124126

125127
m.mu.Lock()

node.go

+32-12
Original file line numberDiff line numberDiff line change
@@ -61,28 +61,48 @@ func NewRawNodeWithID(addr string, id uint32) (*RawNode, error) {
6161
// connect to this node and associate it with the manager.
6262
func (n *RawNode) connect(mgr *RawManager) error {
6363
n.mgr = mgr
64+
n.channel = newChannel(n)
6465
if n.mgr.opts.noConnect {
6566
return nil
6667
}
67-
n.channel = newChannel(n)
68-
var err error
69-
ctx, cancel := context.WithTimeout(context.Background(), n.mgr.opts.nodeDialTimeout)
70-
defer cancel()
71-
n.conn, err = grpc.DialContext(ctx, n.addr, n.mgr.opts.grpcDialOpts...)
72-
if err != nil {
73-
return fmt.Errorf("dialing node failed: %w", err)
68+
// ignoring the error because it will try to reconnect
69+
// at a later time.
70+
_ = n.dial()
71+
ctx := n.ctxSetup()
72+
if err := n.channel.connect(ctx, n.conn); err != nil {
73+
return fmt.Errorf("starting stream failed: %w", err)
74+
}
75+
return nil
76+
}
77+
78+
// dial dials the node if no connection exists yet (n.conn is nil); otherwise it does nothing and returns nil.
79+
func (n *RawNode) dial() error {
80+
if n.conn == nil {
81+
var err error
82+
ctx, cancel := context.WithTimeout(context.Background(), n.mgr.opts.nodeDialTimeout)
83+
defer cancel()
84+
// the error is returned to the caller, which may retry the dial at a later time
85+
n.conn, err = grpc.DialContext(ctx, n.addr, n.mgr.opts.grpcDialOpts...)
86+
return err
7487
}
88+
return nil
89+
}
90+
91+
// ctxSetup creates a context that governs the channel. It is
92+
// used to stop all channel goroutines and the NodeStream.
93+
//
94+
// This method should be run for each connection to ensure
95+
// fresh contexts. Reusing contexts could result in reusing
96+
// a cancelled context.
97+
func (n *RawNode) ctxSetup() context.Context {
7598
md := n.mgr.opts.metadata.Copy()
7699
if n.mgr.opts.perNodeMD != nil {
77100
md = metadata.Join(md, n.mgr.opts.perNodeMD(n.id))
78101
}
79-
// a context for all of the streams
102+
var ctx context.Context
80103
ctx, n.cancel = context.WithCancel(context.Background())
81104
ctx = metadata.NewOutgoingContext(ctx, md)
82-
if err = n.channel.connect(ctx, n.conn); err != nil {
83-
return fmt.Errorf("starting stream failed: %w", err)
84-
}
85-
return nil
105+
return ctx
86106
}
87107

88108
// close this node.

opts.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,9 @@ type managerOptions struct {
2222

2323
func newManagerOptions() managerOptions {
2424
return managerOptions{
25-
backoff: backoff.DefaultConfig,
26-
sendBuffer: 0,
25+
backoff: backoff.DefaultConfig,
26+
sendBuffer: 0,
27+
nodeDialTimeout: 50 * time.Millisecond,
2728
}
2829
}
2930

0 commit comments

Comments
 (0)