fix: fix node resolution cache for nodes in maintenance mode

utkuozdemir · Unix4ever · utkuozdemir · commit 419c21d23d29 · 2025-01-30T15:51:18.000+01:00
There was a problem with the node resolution (a.k.a. DNS) cache of the nodes.
When a machine is in maintenance mode, there is a corresponding `MachineStatus` resource for it, but there isn't any `ClusterMachineIdentity`.
Both of these types trigger updates in the node resolution cache.
When a machine was never part of a cluster, the only source is `MachineStatus`, and the cache updates on it did not populate the machine ID in the cache.
This caused the GRPC router to pick the wrong destination.

Furthermore, we did not remove the cluster and node name information from the cache when a machine was removed from a cluster. This caused the cache to contain obsolete cluster information, causing Talos GRPC proxy to not proxy the requests correctly after a machine was removed from a cluster.

Co-authored-by: Artem Chernyshev &lt;artem.chernyshev@talos-systems.com&gt;
Signed-off-by: Utku Ozdemir &lt;utku.ozdemir@siderolabs.com&gt;
diff --git a/internal/backend/dns/service.go b/internal/backend/dns/service.go
@@ -281,6 +281,7 @@ func (d *Service) updateEntryByMachineStatus(res *omni.MachineStatus) {
 
 	info := d.machineIDToInfo[res.Metadata().ID()]
 
+	info.ID = res.Metadata().ID()
 	info.TalosVersion = version
 	info.managementEndpoint = res.TypedSpec().Value.ManagementAddress
 
@@ -316,6 +317,8 @@ func (d *Service) deleteIdentityMappings(id resource.ID) {
 		d.nodenameToID.remove(info.Name, id)
 	}
 
+	info.Cluster = ""
+	info.Name = ""
 	info.address = ""
 
 	d.machineIDToInfo[id] = info
diff --git a/internal/backend/dns/service_test.go b/internal/backend/dns/service_test.go
@@ -115,7 +115,7 @@ func (suite *ServiceSuite) TestResolve() {
 	// destroy the identity, assert that it doesn't resolve anymore
 	suite.Require().NoError(suite.state.Destroy(suite.ctx, identity.Metadata()))
 
-	expected = dns.NewInfo(cluster, "test-1", "test-1-node", "")
+	expected = dns.NewInfo("", "test-1", "", "")
 	expected.TalosVersion = machineStatus.TypedSpec().Value.TalosVersion
 
 	// still resolves by the node id, but has an empty address
@@ -129,6 +129,22 @@ func (suite *ServiceSuite) TestResolve() {
 	suite.assertResolve("test-1", zeroInfo)
 }
 
+func (suite *ServiceSuite) TestResolveByMachineStatus() {
+	expected := dns.NewInfo("", "test-1", "", "")
+
+	expected.TalosVersion = "3.2.1"
+
+	// In the maintenance mode, we only have MachineStatus, so we start with that
+	// (means cache will be initialized with the data on MachineStatus and nothing else - no ClusterMachineIdentity)
+	machineStatus := omni.NewMachineStatus(resources.DefaultNamespace, "test-1")
+
+	machineStatus.TypedSpec().Value.TalosVersion = "3.2.1"
+
+	suite.Require().NoError(suite.state.Create(suite.ctx, machineStatus))
+
+	suite.assertResolve("test-1", expected)
+}
+
 func (suite *ServiceSuite) assertResolveAddress(cluster, node, expected string) {
 	err := retry.Constant(3*time.Second, retry.WithUnits(100*time.Millisecond)).RetryWithContext(suite.ctx, func(context.Context) error {
 		resolved := suite.dnsService.Resolve(cluster, node)