Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 419c21d

Browse files
utkuozdemir and Unix4ever
committed Jan 30, 2025
fix: fix node resolution cache for nodes in maintenance mode
There was a problem with the node resolution (a.k.a. DNS) cache of the nodes. When a machine is in maintenance mode, there is a corresponding `MachineStatus` resource for it, but there isn't any `ClusterMachineIdentity`. Both of these types trigger updates in the node resolution cache. When a machine was never part of a cluster, the only source is `MachineStatus`, and the cache updates triggered by it did not populate the machine ID in the cache. This caused the gRPC router to pick the wrong destination. Furthermore, we did not remove the cluster and node name information from the cache when a machine was removed from a cluster. This left obsolete cluster information in the cache, causing the Talos gRPC proxy to not proxy the requests correctly after a machine was removed from a cluster.

Co-authored-by: Artem Chernyshev <artem.chernyshev@talos-systems.com>
Signed-off-by: Utku Ozdemir <utku.ozdemir@siderolabs.com>
1 parent 65244f6 commit 419c21d

File tree

2 files changed

+20
-1
lines changed

2 files changed

+20
-1
lines changed
 

‎internal/backend/dns/service.go

+3
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,7 @@ func (d *Service) updateEntryByMachineStatus(res *omni.MachineStatus) {
281281

282282
info := d.machineIDToInfo[res.Metadata().ID()]
283283

284+
info.ID = res.Metadata().ID()
284285
info.TalosVersion = version
285286
info.managementEndpoint = res.TypedSpec().Value.ManagementAddress
286287

@@ -316,6 +317,8 @@ func (d *Service) deleteIdentityMappings(id resource.ID) {
316317
d.nodenameToID.remove(info.Name, id)
317318
}
318319

320+
info.Cluster = ""
321+
info.Name = ""
319322
info.address = ""
320323

321324
d.machineIDToInfo[id] = info

‎internal/backend/dns/service_test.go

+17-1
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ func (suite *ServiceSuite) TestResolve() {
115115
// destroy the identity, assert that it doesn't resolve anymore
116116
suite.Require().NoError(suite.state.Destroy(suite.ctx, identity.Metadata()))
117117

118-
expected = dns.NewInfo(cluster, "test-1", "test-1-node", "")
118+
expected = dns.NewInfo("", "test-1", "", "")
119119
expected.TalosVersion = machineStatus.TypedSpec().Value.TalosVersion
120120

121121
// still resolves by the node id, but has an empty address
@@ -129,6 +129,22 @@ func (suite *ServiceSuite) TestResolve() {
129129
suite.assertResolve("test-1", zeroInfo)
130130
}
131131

132+
func (suite *ServiceSuite) TestResolveByMachineStatus() {
133+
expected := dns.NewInfo("", "test-1", "", "")
134+
135+
expected.TalosVersion = "3.2.1"
136+
137+
// In the maintenance mode, we only have MachineStatus, so we start with that
138+
// (means cache will be initialized with the data on MachineStatus and nothing else - no ClusterMachineIdentity)
139+
machineStatus := omni.NewMachineStatus(resources.DefaultNamespace, "test-1")
140+
141+
machineStatus.TypedSpec().Value.TalosVersion = "3.2.1"
142+
143+
suite.Require().NoError(suite.state.Create(suite.ctx, machineStatus))
144+
145+
suite.assertResolve("test-1", expected)
146+
}
147+
132148
func (suite *ServiceSuite) assertResolveAddress(cluster, node, expected string) {
133149
err := retry.Constant(3*time.Second, retry.WithUnits(100*time.Millisecond)).RetryWithContext(suite.ctx, func(context.Context) error {
134150
resolved := suite.dnsService.Resolve(cluster, node)

0 commit comments

Comments
 (0)
Please sign in to comment.