Skip to content

Commit 8f57eda

Browse files
committed
protocols: client: Add timeout for hybrid vsock handshake
When the client tries to connect sometimes a race condition could happen when the between bind and listen calls in the agent vsock side. This will block the hypervisor wanting for a response and as consequence the agent client where it checks for an OK response. This case needs to be handled by the guest kernel, see https://lore.kernel.org/netdev/[email protected]/ As an extra protection make the agent client timeout if no OK response is given. The response should be quick so is OK to wait a few seconds and then timeout. This also allow to return an error from the dialler function so retry does not fallback on grpc retry making retries faster. Fixes: kata-containers#372 Signed-off-by: Jose Carlos Venegas Munoz <[email protected]>
1 parent 36b37f6 commit 8f57eda

File tree

1 file changed

+34
-17
lines changed

1 file changed

+34
-17
lines changed

Diff for: protocols/client/client.go

+34-17
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ package client
99
import (
1010
"bufio"
1111
"context"
12+
"errors"
1213
"fmt"
1314
"net"
1415
"net/url"
@@ -400,6 +401,7 @@ func HybridVSockDialer(sock string, timeout time.Duration) (net.Conn, error) {
400401
}
401402

402403
dialFunc := func() (net.Conn, error) {
404+
handshakeTimeout := 10 * time.Second
403405
conn, err := net.DialTimeout("unix", udsPath, timeout)
404406
if err != nil {
405407
return nil, err
@@ -418,26 +420,41 @@ func HybridVSockDialer(sock string, timeout time.Duration) (net.Conn, error) {
418420
return nil, err
419421
}
420422

421-
// A trivial handshake is included in the host-initiated vsock connection protocol.
422-
// It looks like this:
423-
// - [host] CONNECT <port><LF>
424-
// - [guest/success] OK <assigned_host_port><LF>
425-
reader := bufio.NewReader(conn)
426-
response, err := reader.ReadString('\n')
427-
if err != nil {
428-
conn.Close()
429-
agentClientLog.WithField("Error", err).Debug("HybridVsock trivial handshake failed")
430-
// for now, we temporarily rely on the backoff strategy from GRPC for more stable CI.
423+
errChan := make(chan error)
424+
425+
go func() {
426+
reader := bufio.NewReader(conn)
427+
response, err := reader.ReadString('\n')
428+
if err != nil {
429+
errChan <- err
430+
return
431+
}
432+
433+
agentClientLog.WithField("response", response).Debug("HybridVsock trivial handshake")
434+
435+
if strings.Contains(response, "OK") {
436+
errChan <- nil
437+
} else {
438+
errChan <- errors.New("HybridVsock trivial handshake failed with malformed response code")
439+
}
440+
}()
441+
442+
select {
443+
case err = <-errChan:
444+
if err != nil {
445+
conn.Close()
446+
agentClientLog.WithField("Error", err).Debug("HybridVsock trivial handshake failed")
447+
return nil, err
448+
449+
}
431450
return conn, nil
432-
} else if !strings.Contains(response, "OK") {
451+
case <-time.After(handshakeTimeout):
452+
// Timeout: kernel vsock implementation has a race condition, where no response is given
453+
// Instead of waiting forever for a response, timeout after a fair amount of time.
454+
// See: https://lore.kernel.org/netdev/[email protected]/
433455
conn.Close()
434-
agentClientLog.WithField("response", response).Debug("HybridVsock trivial handshake failed with malformd response code")
435-
// for now, we temporarily rely on the backoff strategy from GRPC for more stable CI.
436-
return conn, nil
456+
return nil, errors.New("timeout waiting for hybrid vsocket handshake")
437457
}
438-
agentClientLog.WithField("response", response).Debug("HybridVsock trivial handshake")
439-
440-
return conn, nil
441458
}
442459

443460
timeoutErr := grpcStatus.Errorf(codes.DeadlineExceeded, "timed out connecting to hybrid vsocket %s", sock)

0 commit comments

Comments
 (0)