Skip to content

Commit cd6e3dd

Browse files
committed
fix: retry failed connections and stop gracefully on error
1 parent 11feab2 commit cd6e3dd

File tree

2 files changed

+50
-13
lines changed

2 files changed

+50
-13
lines changed

lib/gateway/connection.ex

+8-1
Original file line numberDiff line numberDiff line change
@@ -387,7 +387,8 @@ defmodule Crux.Gateway.Connection do
387387
end
388388
end
389389

390-
def terminate(:fatal_close_code, _state, data) do
390+
def terminate(reason, _state, data)
391+
when reason in [:fatal_close_code, :connection_failure] do
391392
Gateway.stop_shard(data.name, {data.shard_id, data.shard_count})
392393
end
393394

@@ -472,6 +473,12 @@ defmodule Crux.Gateway.Connection do
472473
{:next_state, @disconnected, data, :postpone}
473474
end
474475

476+
defp handle_common(_state, :info, {:EXIT, conn, :connection_failure}, %{conn: conn}) do
477+
Logger.error(fn -> "Connection failed to establish connection, shutting down." end)
478+
479+
{:stop, :connection_failure}
480+
end
481+
475482
defp handle_packet(packet, data)
476483

477484
defp handle_packet(%{op: @heartbeat}, %{conn: conn, seq: seq} = data) do

lib/gateway/connection/gun.ex

+42-12
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,8 @@ defmodule Crux.Gateway.Connection.Gun do
8484
@connecting :connecting
8585
@connected :connected
8686

87+
@attempt_limit 5
88+
8789
defstruct [
8890
:parent,
8991
:host,
@@ -93,6 +95,7 @@ defmodule Crux.Gateway.Connection.Gun do
9395
:zlib,
9496
:buffer,
9597
:conn,
98+
:attempts,
9699
:expect_disconnect
97100
]
98101

@@ -109,6 +112,8 @@ defmodule Crux.Gateway.Connection.Gun do
109112
buffer: binary(),
110113
# WS connection wrapper process
111114
conn: pid() | nil,
115+
# Limit the amount of attempts to establish a connection
116+
attempts: non_neg_integer(),
112117
# Whether we are expecting a gun_down / disconnect
113118
# and do not want to notify the spawning process again
114119
expect_disconnect: boolean()
@@ -136,6 +141,7 @@ defmodule Crux.Gateway.Connection.Gun do
136141
zlib: nil,
137142
buffer: <<>>,
138143
conn: nil,
144+
attempts: 0,
139145
expect_disconnect: false
140146
}
141147

@@ -184,30 +190,54 @@ defmodule Crux.Gateway.Connection.Gun do
184190
z = Zlib.open()
185191
Zlib.inflateInit(z)
186192

187-
Logger.debug(fn -> "Starting a process to connect to #{data.host}:#{data.port}" end)
193+
attempts = data.attempts + 1
194+
data = %{data | attempts: attempts}
195+
196+
Logger.debug(fn ->
197+
"Starting a process to connect to #{data.host}:#{data.port} (Attempt: #{attempts} / #{@attempt_limit})"
198+
end)
188199

189200
# > Gun does not currently support Websocket over HTTP/2.
190201
{:ok, conn} = Gun.open(data.host, data.port, %{protocols: [:http]})
191202

192203
Logger.debug(fn -> "Process started, waiting for its connection to be up." end)
193204

194-
{:ok, :http} = Gun.await_up(conn)
205+
conn
206+
|> Gun.await_up()
207+
|> case do
208+
{:ok, :http} ->
209+
Logger.debug(fn ->
210+
"Connection is up, now upgrading it to use the WebSocket protocol, using " <>
211+
data.path <> data.query
212+
end)
195213

196-
Logger.debug(fn ->
197-
"Connection is up, now upgrading it to use the WebSocket protocol, using " <>
198-
data.path <> data.query
199-
end)
214+
stream_ref = Gun.ws_upgrade(conn, data.path <> data.query)
215+
:ok = await_upgrade(conn, stream_ref)
216+
217+
Logger.debug(fn ->
218+
"Connection upgraded to use the WebSocket protocol, we are good to go!"
219+
end)
220+
221+
send_connected(data)
200222

201-
stream_ref = Gun.ws_upgrade(conn, data.path <> data.query)
202-
:ok = await_upgrade(conn, stream_ref)
223+
data = %{data | conn: conn, zlib: z, attempts: 0}
203224

204-
Logger.debug(fn -> "Connection upgraded to use the WebSocket protocol, we are good to go!" end)
225+
{:keep_state, data, {:timeout, 0, :connected}}
205226

206-
send_connected(data)
227+
{:error, :timeout} when attempts >= @attempt_limit ->
228+
Logger.error(fn ->
229+
"Connection timed out, no attempts remaining, won't retry. (#{attempts} / #{@attempt_limit})"
230+
end)
207231

208-
data = %{data | conn: conn, zlib: z}
232+
{:stop, :connection_failure, data}
209233

210-
{:keep_state, data, {:timeout, 0, :connected}}
234+
{:error, :timeout} ->
235+
Logger.warn(fn ->
236+
"Connection timed out, will retry. (#{attempts} / #{@attempt_limit})"
237+
end)
238+
239+
{:repeat_state, data}
240+
end
211241
end
212242

213243
def connecting(:timeout, :connected, data) do

0 commit comments

Comments
 (0)