@@ -78,6 +78,12 @@ type HostAgent struct {
7878 clientMu sync.RWMutex
7979 client * guestagentclient.GuestAgentClient
8080
81+ // gaSockForwardMu serializes (re-)establishment of the SSH forward for
82+ // the guest-agent unix socket. The reconnect loop in watchGuestAgentEvents
83+ // and the inotify goroutine both touch the same local socket path; without
84+ // this lock they can race on os.RemoveAll/bind and leave ga.sock missing.
85+ gaSockForwardMu sync.Mutex
86+
8187 guestAgentAliveCh chan struct {} // closed on establishing the connection
8288 guestAgentAliveChOnce sync.Once
8389
@@ -719,7 +725,7 @@ func (a *HostAgent) watchGuestAgentEvents(ctx context.Context) {
719725 }
720726 }
721727 if a .driver .ForwardGuestAgent (ctx ) {
722- if err := forwardSSH (ctx , a . sshConfig , sshAddress , sshPort , localUnix , remoteUnix , verbCancel , false ); err != nil {
728+ if err := a . cancelGuestAgentSockForward (ctx , localUnix , remoteUnix ); err != nil {
723729 errs = append (errs , err )
724730 }
725731 }
@@ -733,8 +739,7 @@ func (a *HostAgent) watchGuestAgentEvents(ctx context.Context) {
733739 client := a .getClient ()
734740 if client == nil || ! isGuestAgentSocketAccessible (ctx , client ) {
735741 if a .driver .ForwardGuestAgent (ctx ) {
736- sshAddress , sshPort := a .sshAddressPort ()
737- _ = forwardSSH (ctx , a .sshConfig , sshAddress , sshPort , localUnix , remoteUnix , verbForward , false )
742+ a .forwardGuestAgentSock (ctx , localUnix , remoteUnix )
738743 }
739744 }
740745 // Re-spawn startInotify when its gRPC stream dies (typically because
@@ -769,8 +774,7 @@ func (a *HostAgent) watchGuestAgentEvents(ctx context.Context) {
769774 client := a .getClient ()
770775 if client == nil || ! isGuestAgentSocketAccessible (ctx , client ) {
771776 if a .driver .ForwardGuestAgent (ctx ) {
772- sshAddress , sshPort := a .sshAddressPort ()
773- _ = forwardSSH (ctx , a .sshConfig , sshAddress , sshPort , localUnix , remoteUnix , verbForward , false )
777+ a .forwardGuestAgentSock (ctx , localUnix , remoteUnix )
774778 }
775779 }
776780 client , err := a .getOrCreateClient (ctx )
@@ -949,7 +953,52 @@ func executeSSH(ctx context.Context, sshConfig *ssh.SSHConfig, sshAddress string
949953 return nil
950954}
951955
952- func forwardSSH (ctx context.Context , sshConfig * ssh.SSHConfig , sshAddress string , sshPort int , local , remote , verb string , reverse bool ) error {
956+ // forwardGuestAgentSock establishes (or re-establishes) the SSH local forward
957+ // of the guest-agent unix socket. It is used both for the initial setup and
958+ // to bring the forward back up after the guest agent has been restarted, the
959+ // VM has been rebooted, or the gRPC stream has otherwise become unhealthy.
960+ //
961+ // The previous behavior was to call forwardSSH(verbForward) directly on every
962+ // reconnect tick. forwardSSH unlinks the local socket file as its first step
963+ // (so a fresh listener can bind), and the SSH ControlMaster still has the
964+ // previous forward registered for the same listen path. The duplicate
965+ // registration causes ssh -O forward to exit non-zero and forwardSSH to unlink
966+ // the socket a second time on its failure branch — leaving ga.sock permanently
967+ // missing on disk and breaking host↔guest gRPC, dynamic port forwarding, and
968+ // inotify mount invalidation until limactl stop && limactl start. See #2227.
969+ //
970+ // The fix is twofold:
971+ // 1. Best-effort verbCancel before verbForward, so the ControlMaster releases
972+ // the prior registration and the new bind succeeds cleanly.
973+ // 2. Serialize via gaSockForwardMu, so the reconnect loop in
974+ // watchGuestAgentEvents, the inotify setup goroutine, and the cleanup
975+ // path cannot race on os.RemoveAll/bind of the same path.
976+ func (a * HostAgent ) forwardGuestAgentSock (ctx context.Context , localUnix , remoteUnix string ) {
977+ a .gaSockForwardMu .Lock ()
978+ defer a .gaSockForwardMu .Unlock ()
979+ sshAddress , sshPort := a .sshAddressPort ()
980+ // Best-effort teardown of any prior forward registered with the
981+ // ControlMaster. Errors are expected (e.g. on the very first call when
982+ // no forward exists yet) and intentionally ignored. Use ctx so shutdown
983+ // can unblock this call if the ControlMaster is unresponsive.
984+ _ = forwardSSH (ctx , a .sshConfig , sshAddress , sshPort , localUnix , remoteUnix , verbCancel , false )
985+ if err := forwardSSH (ctx , a .sshConfig , sshAddress , sshPort , localUnix , remoteUnix , verbForward , false ); err != nil {
986+ logrus .WithError (err ).Warn ("failed to (re-)establish forward for the guest agent socket; will retry" )
987+ }
988+ }
989+
990+ // cancelGuestAgentSockForward tears down the SSH forward for the guest-agent
991+ // unix socket. Serialized via gaSockForwardMu so it cannot race with the
992+ // reconnect path in forwardGuestAgentSock.
993+ func (a * HostAgent ) cancelGuestAgentSockForward (ctx context.Context , localUnix , remoteUnix string ) error {
994+ a .gaSockForwardMu .Lock ()
995+ defer a .gaSockForwardMu .Unlock ()
996+ sshAddress , sshPort := a .sshAddressPort ()
997+ return forwardSSH (ctx , a .sshConfig , sshAddress , sshPort , localUnix , remoteUnix , verbCancel , false )
998+ }
999+
1000+ // forwardSSH is a var (not a func) so tests can stub it without touching real ssh.
1001+ var forwardSSH = func (ctx context.Context , sshConfig * ssh.SSHConfig , sshAddress string , sshPort int , local , remote , verb string , reverse bool ) error {
9531002 args := sshConfig .Args ()
9541003 args = append (args ,
9551004 "-T" ,
0 commit comments