Skip to content

Commit 1ce9c82

Browse files
authored
Merge pull request #4911 from mn-ram/fix/ga-sock-reconnect
hostagent: stop destroying ga.sock on guest-agent reconnect
2 parents 2255df1 + 9791149 commit 1ce9c82

2 files changed

Lines changed: 196 additions & 6 deletions

File tree

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
// SPDX-FileCopyrightText: Copyright The Lima Authors
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package hostagent
5+
6+
import (
7+
"context"
8+
"sync"
9+
"testing"
10+
"time"
11+
12+
"github.com/lima-vm/sshocker/pkg/ssh"
13+
"gotest.tools/v3/assert"
14+
)
15+
16+
// Regression test for https://github.com/lima-vm/lima/issues/2227.
17+
//
18+
// Two invariants must hold for the guest-agent socket forward:
19+
// 1. forwardGuestAgentSock must issue verbCancel before verbForward on every
20+
// call, so the SSH ControlMaster releases the prior registration and the
21+
// new bind succeeds cleanly. Otherwise forwardSSH unlinks the local
22+
// ga.sock and the socket disappears from disk until limactl stop/start.
23+
// 2. gaSockForwardMu must serialize the forward path against the cancel
24+
// path (cleanup) and against concurrent reconnect ticks. Without
25+
// serialization, os.RemoveAll/bind races leave ga.sock missing.
26+
27+
type forwardCall struct {
28+
verb string
29+
local string
30+
}
31+
32+
// stubForwardSSH replaces the package-level forwardSSH with a stub that
33+
// records calls and optionally runs hook on every call. It returns an
34+
// accessor for the recorded calls and a restore function.
35+
func stubForwardSSH(t *testing.T, hook func(verb string)) (calls func() []forwardCall, restore func()) {
36+
t.Helper()
37+
var mu sync.Mutex
38+
var recorded []forwardCall
39+
orig := forwardSSH
40+
forwardSSH = func(_ context.Context, _ *ssh.SSHConfig, _ string, _ int, local, _, verb string, _ bool) error {
41+
if hook != nil {
42+
hook(verb)
43+
}
44+
mu.Lock()
45+
recorded = append(recorded, forwardCall{verb: verb, local: local})
46+
mu.Unlock()
47+
return nil
48+
}
49+
return func() []forwardCall {
50+
mu.Lock()
51+
defer mu.Unlock()
52+
out := make([]forwardCall, len(recorded))
53+
copy(out, recorded)
54+
return out
55+
}, func() {
56+
forwardSSH = orig
57+
}
58+
}
59+
60+
func TestForwardGuestAgentSock_CancelsBeforeForward(t *testing.T) {
61+
calls, restore := stubForwardSSH(t, nil)
62+
defer restore()
63+
64+
a := &HostAgent{}
65+
a.forwardGuestAgentSock(t.Context(), "/tmp/ga.sock", "/run/lima-guestagent.sock")
66+
67+
got := calls()
68+
assert.Equal(t, len(got), 2, "expected exactly one cancel and one forward")
69+
assert.Equal(t, got[0].verb, verbCancel, "cancel must come before forward")
70+
assert.Equal(t, got[1].verb, verbForward, "forward must follow cancel")
71+
}
72+
73+
func TestCancelGuestAgentSockForward_IssuesCancel(t *testing.T) {
74+
calls, restore := stubForwardSSH(t, nil)
75+
defer restore()
76+
77+
a := &HostAgent{}
78+
err := a.cancelGuestAgentSockForward(t.Context(), "/tmp/ga.sock", "/run/lima-guestagent.sock")
79+
assert.NilError(t, err)
80+
81+
got := calls()
82+
assert.Equal(t, len(got), 1)
83+
assert.Equal(t, got[0].verb, verbCancel)
84+
}
85+
86+
// TestGuestAgentSockForward_SerializedUnderContention runs many concurrent
87+
// forward and cancel callers and asserts that at most one forwardSSH call is
88+
// in flight at any moment (max observed concurrency == 1). If
89+
// gaSockForwardMu were missing or scoped incorrectly, two goroutines could
90+
// enter forwardSSH simultaneously and race on the local ga.sock listener.
91+
func TestGuestAgentSockForward_SerializedUnderContention(t *testing.T) {
92+
const (
93+
forwarders = 8
94+
cancelers = 8
95+
iterations = 50
96+
)
97+
98+
var (
99+
mu sync.Mutex
100+
active int
101+
maxActive int
102+
)
103+
hook := func(_ string) {
104+
mu.Lock()
105+
active++
106+
if active > maxActive {
107+
maxActive = active
108+
}
109+
mu.Unlock()
110+
// Widen the window so any missing serialization can manifest.
111+
time.Sleep(50 * time.Microsecond)
112+
mu.Lock()
113+
active--
114+
mu.Unlock()
115+
}
116+
117+
_, restore := stubForwardSSH(t, hook)
118+
defer restore()
119+
120+
a := &HostAgent{}
121+
var wg sync.WaitGroup
122+
for range forwarders {
123+
wg.Go(func() {
124+
for range iterations {
125+
a.forwardGuestAgentSock(t.Context(), "/tmp/ga.sock", "/run/lima-guestagent.sock")
126+
}
127+
})
128+
}
129+
for range cancelers {
130+
wg.Go(func() {
131+
for range iterations {
132+
_ = a.cancelGuestAgentSockForward(t.Context(), "/tmp/ga.sock", "/run/lima-guestagent.sock")
133+
}
134+
})
135+
}
136+
wg.Wait()
137+
138+
mu.Lock()
139+
defer mu.Unlock()
140+
assert.Equal(t, maxActive, 1, "forwardSSH calls overlapped — gaSockForwardMu is not serializing the ga.sock forward path")
141+
}

pkg/hostagent/hostagent.go

Lines changed: 55 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,12 @@ type HostAgent struct {
7878
clientMu sync.RWMutex
7979
client *guestagentclient.GuestAgentClient
8080

81+
// gaSockForwardMu serializes (re-)establishment of the SSH forward for
82+
// the guest-agent unix socket. The reconnect loop in watchGuestAgentEvents
83+
// and the inotify goroutine both touch the same local socket path; without
84+
// this lock they can race on os.RemoveAll/bind and leave ga.sock missing.
85+
gaSockForwardMu sync.Mutex
86+
8187
guestAgentAliveCh chan struct{} // closed on establishing the connection
8288
guestAgentAliveChOnce sync.Once
8389

@@ -719,7 +725,7 @@ func (a *HostAgent) watchGuestAgentEvents(ctx context.Context) {
719725
}
720726
}
721727
if a.driver.ForwardGuestAgent(ctx) {
722-
if err := forwardSSH(ctx, a.sshConfig, sshAddress, sshPort, localUnix, remoteUnix, verbCancel, false); err != nil {
728+
if err := a.cancelGuestAgentSockForward(ctx, localUnix, remoteUnix); err != nil {
723729
errs = append(errs, err)
724730
}
725731
}
@@ -733,8 +739,7 @@ func (a *HostAgent) watchGuestAgentEvents(ctx context.Context) {
733739
client := a.getClient()
734740
if client == nil || !isGuestAgentSocketAccessible(ctx, client) {
735741
if a.driver.ForwardGuestAgent(ctx) {
736-
sshAddress, sshPort := a.sshAddressPort()
737-
_ = forwardSSH(ctx, a.sshConfig, sshAddress, sshPort, localUnix, remoteUnix, verbForward, false)
742+
a.forwardGuestAgentSock(ctx, localUnix, remoteUnix)
738743
}
739744
}
740745
// Re-spawn startInotify when its gRPC stream dies (typically because
@@ -769,8 +774,7 @@ func (a *HostAgent) watchGuestAgentEvents(ctx context.Context) {
769774
client := a.getClient()
770775
if client == nil || !isGuestAgentSocketAccessible(ctx, client) {
771776
if a.driver.ForwardGuestAgent(ctx) {
772-
sshAddress, sshPort := a.sshAddressPort()
773-
_ = forwardSSH(ctx, a.sshConfig, sshAddress, sshPort, localUnix, remoteUnix, verbForward, false)
777+
a.forwardGuestAgentSock(ctx, localUnix, remoteUnix)
774778
}
775779
}
776780
client, err := a.getOrCreateClient(ctx)
@@ -949,7 +953,52 @@ func executeSSH(ctx context.Context, sshConfig *ssh.SSHConfig, sshAddress string
949953
return nil
950954
}
951955

952-
func forwardSSH(ctx context.Context, sshConfig *ssh.SSHConfig, sshAddress string, sshPort int, local, remote, verb string, reverse bool) error {
956+
// forwardGuestAgentSock establishes (or re-establishes) the SSH local forward
957+
// of the guest-agent unix socket. It is used both for the initial setup and
958+
// to bring the forward back up after the guest agent has been restarted, the
959+
// VM has been rebooted, or the gRPC stream has otherwise become unhealthy.
960+
//
961+
// The previous behavior was to call forwardSSH(verbForward) directly on every
962+
// reconnect tick. forwardSSH unlinks the local socket file as its first step
963+
// (so a fresh listener can bind), and the SSH ControlMaster still has the
964+
// previous forward registered for the same listen path. The duplicate
965+
// registration causes ssh -O forward to exit non-zero and forwardSSH to unlink
966+
// the socket a second time on its failure branch — leaving ga.sock permanently
967+
// missing on disk and breaking host↔guest gRPC, dynamic port forwarding, and
968+
// inotify mount invalidation until limactl stop && limactl start. See #2227.
969+
//
970+
// The fix is twofold:
971+
// 1. Best-effort verbCancel before verbForward, so the ControlMaster releases
972+
// the prior registration and the new bind succeeds cleanly.
973+
// 2. Serialize via gaSockForwardMu, so the reconnect loop in
974+
// watchGuestAgentEvents, the inotify setup goroutine, and the cleanup
975+
// path cannot race on os.RemoveAll/bind of the same path.
976+
func (a *HostAgent) forwardGuestAgentSock(ctx context.Context, localUnix, remoteUnix string) {
977+
a.gaSockForwardMu.Lock()
978+
defer a.gaSockForwardMu.Unlock()
979+
sshAddress, sshPort := a.sshAddressPort()
980+
// Best-effort teardown of any prior forward registered with the
981+
// ControlMaster. Errors are expected (e.g. on the very first call when
982+
// no forward exists yet) and intentionally ignored. Use ctx so shutdown
983+
// can unblock this call if the ControlMaster is unresponsive.
984+
_ = forwardSSH(ctx, a.sshConfig, sshAddress, sshPort, localUnix, remoteUnix, verbCancel, false)
985+
if err := forwardSSH(ctx, a.sshConfig, sshAddress, sshPort, localUnix, remoteUnix, verbForward, false); err != nil {
986+
logrus.WithError(err).Warn("failed to (re-)establish forward for the guest agent socket; will retry")
987+
}
988+
}
989+
990+
// cancelGuestAgentSockForward tears down the SSH forward for the guest-agent
991+
// unix socket. Serialized via gaSockForwardMu so it cannot race with the
992+
// reconnect path in forwardGuestAgentSock.
993+
func (a *HostAgent) cancelGuestAgentSockForward(ctx context.Context, localUnix, remoteUnix string) error {
994+
a.gaSockForwardMu.Lock()
995+
defer a.gaSockForwardMu.Unlock()
996+
sshAddress, sshPort := a.sshAddressPort()
997+
return forwardSSH(ctx, a.sshConfig, sshAddress, sshPort, localUnix, remoteUnix, verbCancel, false)
998+
}
999+
1000+
// forwardSSH is a var (not a func) so tests can stub it without touching real ssh.
1001+
var forwardSSH = func(ctx context.Context, sshConfig *ssh.SSHConfig, sshAddress string, sshPort int, local, remote, verb string, reverse bool) error {
9531002
args := sshConfig.Args()
9541003
args = append(args,
9551004
"-T",

0 commit comments

Comments
 (0)