
Commit f842c77

Merge pull request #395 from tnierman/osd-28525
OSD-28525 - Initial implementation for MachineHealthCheckUnterminatedShortCircuitSRE investigation
2 parents 7b0f2f0 + c1da1a1 commit f842c77

16 files changed: +1,862 -4 lines

go.mod

Lines changed: 1 addition & 3 deletions
@@ -22,12 +22,12 @@ require (
     github.com/prometheus/client_golang v1.22.0
     github.com/prometheus/common v0.63.0
     github.com/spf13/cobra v1.8.1
-    github.com/stretchr/testify v1.10.0
     go.uber.org/mock v0.4.0
     go.uber.org/zap v1.27.0
     gopkg.in/yaml.v2 v2.4.0
     k8s.io/api v0.31.3
     k8s.io/apimachinery v0.31.3
+    k8s.io/client-go v0.31.3
     sigs.k8s.io/controller-runtime v0.19.0
 )

@@ -123,7 +123,6 @@
     github.com/peterbourgon/diskv v2.0.1+incompatible // indirect
     github.com/pkg/browser v0.0.0-20210911075715-681adbf594b8 // indirect
     github.com/pkg/errors v0.9.1 // indirect
-    github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
     github.com/prometheus/client_model v0.6.1 // indirect
     github.com/prometheus/procfs v0.15.1 // indirect
     github.com/sagikazarmark/locafero v0.7.0 // indirect

@@ -155,7 +154,6 @@
     gopkg.in/ini.v1 v1.67.0 // indirect
     gopkg.in/yaml.v3 v3.0.1 // indirect
     k8s.io/cli-runtime v0.30.3 // indirect
-    k8s.io/client-go v0.31.3 // indirect
     k8s.io/klog/v2 v2.130.1 // indirect
     k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect
     k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 // indirect
Lines changed: 329 additions & 0 deletions
@@ -0,0 +1,329 @@
/*
machinehealthcheckunterminatedshortcircuitsre defines the investigation logic for the MachineHealthCheckUnterminatedShortCircuitSRE alert
*/
package machinehealthcheckunterminatedshortcircuitsre

import (
    "context"
    "errors"
    "fmt"
    "slices"
    "strings"
    "time"

    machinev1beta1 "github.com/openshift/api/machine/v1beta1"

    "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
    machineutil "github.com/openshift/configuration-anomaly-detection/pkg/investigations/utils/machine"
    nodeutil "github.com/openshift/configuration-anomaly-detection/pkg/investigations/utils/node"
    k8sclient "github.com/openshift/configuration-anomaly-detection/pkg/k8s"
    "github.com/openshift/configuration-anomaly-detection/pkg/logging"
    "github.com/openshift/configuration-anomaly-detection/pkg/notewriter"

    corev1 "k8s.io/api/core/v1"
    "sigs.k8s.io/controller-runtime/pkg/client"
)

const (
    alertname = "MachineHealthCheckUnterminatedShortCircuitSRE"
    // remediationName must match the name of this investigation's directory, so it can be looked up via the backplane-api
    remediationName = "machineHealthCheckUnterminatedShortCircuitSRE"
)

type Investigation struct {
    // kclient provides access to on-cluster resources
    kclient client.Client
    // notes holds the messages that will be shared with Primary upon completion
    notes *notewriter.NoteWriter
    // recommendations holds the set of actions CAD recommends primary to take
    recommendations investigationRecommendations
}

func (i *Investigation) setup(r *investigation.Resources) error {
    // Setup investigation
    k, err := k8sclient.New(r.Cluster.ID(), r.OcmClient, remediationName)
    if err != nil {
        return fmt.Errorf("failed to initialize kubernetes client: %w", err)
    }
    i.kclient = k
    i.notes = notewriter.New(r.Name, logging.RawLogger)
    i.recommendations = investigationRecommendations{}

    return nil
}

// Run investigates the MachineHealthCheckUnterminatedShortCircuitSRE alert
//
// The investigation seeks to provide exactly one recommended action per affected machine/node pair.
// The machine object is evaluated first, as it represents a lower-level object that could affect the health of the node.
// If the investigation determines that the breakage is occurring at the machine level, the corresponding node is *not* investigated.
// After investigating all affected machines, potentially affected nodes are investigated.
func (i *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
    ctx := context.Background()
    result := investigation.InvestigationResult{}

    // Setup & teardown
    err := i.setup(r)
    if err != nil {
        return result, fmt.Errorf("failed to setup investigation: %w", err)
    }
    defer func(r *investigation.Resources) {
        err := k8sclient.Cleanup(r.Cluster.ID(), r.OcmClient, remediationName)
        if err != nil {
            logging.Errorf("failed to cleanup investigation: %v", err)
        }
    }(r)

    targetMachines, err := i.getMachinesFromFailingMHC(ctx)
    if err != nil {
        i.notes.AppendWarning("failed to retrieve one or more target machines: %v", err)
    }
    if len(targetMachines) == 0 {
        i.notes.AppendWarning("no machines found for short-circuited machinehealthcheck objects")
        return result, r.PdClient.EscalateIncidentWithNote(i.notes.String())
    }

    problemMachines, err := i.InvestigateMachines(ctx, targetMachines)
    if err != nil {
        i.notes.AppendWarning("failed to investigate machines: %v", err)
    }

    // Trim out the machines that we've already investigated and know have problems
    //
    // The purpose of this is to avoid re-investigating nodes whose machines were already investigated. Any node-level issue on a failing machine
    // is most likely related to the machine itself, and providing duplicate/conflicting advice will only prove confusing to the responder
    targetMachines = slices.DeleteFunc(targetMachines, func(targetMachine machinev1beta1.Machine) bool {
        for _, problemMachine := range problemMachines {
            if problemMachine.Name == targetMachine.Name {
                return true
            }
        }
        return false
    })

    // If one or more machines managed by the machinehealthcheck have not yet been identified as a problem, check on the machine's
    // node to determine if there are node-level problems that need remediating
    if len(targetMachines) > 0 {
        targetNodes, err := machineutil.GetNodesForMachines(ctx, i.kclient, targetMachines)
        if err != nil {
            i.notes.AppendWarning("failed to retrieve one or more target nodes: %v", err)
        }
        if len(targetNodes) > 0 {
            i.InvestigateNodes(targetNodes)
        }
    }

    // Summarize recommendations from investigation in PD notes, if any found
    if len(i.recommendations) > 0 {
        i.notes.AppendWarning(i.recommendations.summarize())
    } else {
        i.notes.AppendSuccess("no recommended actions to take against cluster")
    }

    return result, r.PdClient.EscalateIncidentWithNote(i.notes.String())
}

func (i *Investigation) getMachinesFromFailingMHC(ctx context.Context) ([]machinev1beta1.Machine, error) {
    healthchecks := machinev1beta1.MachineHealthCheckList{}
    err := i.kclient.List(ctx, &healthchecks, &client.ListOptions{Namespace: machineutil.MachineNamespace})
    if err != nil {
        return []machinev1beta1.Machine{}, fmt.Errorf("failed to retrieve machinehealthchecks from cluster: %w", err)
    }

    targets := []machinev1beta1.Machine{}
    for _, healthcheck := range healthchecks.Items {
        if !machineutil.HealthcheckRemediationAllowed(healthcheck) {
            machines, err := machineutil.GetMachinesForMHC(ctx, i.kclient, healthcheck)
            if err != nil {
                i.notes.AppendWarning("failed to retrieve machines from machinehealthcheck %q: %v", healthcheck.Name, err)
                continue
            }
            targets = append(targets, machines...)
        }
    }

    return targets, nil
}

// InvestigateMachines evaluates the state of the machines in the cluster and returns a list of the failing machines, along with a categorized set of recommendations based on the failure state of
// each machine
func (i *Investigation) InvestigateMachines(ctx context.Context, targets []machinev1beta1.Machine) ([]machinev1beta1.Machine, error) {
    investigatedMachines := []machinev1beta1.Machine{}
    var errs error
    for _, machine := range targets {
        if machine.DeletionTimestamp != nil {
            err := i.investigateDeletingMachine(ctx, machine)
            investigatedMachines = append(investigatedMachines, machine)
            if err != nil {
                errs = errors.Join(errs, fmt.Errorf("failed to investigate deleting machine: %w", err))
            }
            continue
        }

        if (machine.Status.Phase != nil && *machine.Status.Phase == machinev1beta1.PhaseFailed) || machine.Status.ErrorReason != nil {
            err := i.investigateFailingMachine(machine)
            investigatedMachines = append(investigatedMachines, machine)
            if err != nil {
                errs = errors.Join(errs, fmt.Errorf("failed to investigate failing machine: %w", err))
            }
        }
    }

    if len(investigatedMachines) == 0 {
        i.notes.AppendSuccess("no deleting or Failed machines found")
    }
    return investigatedMachines, errs
}

// investigateFailingMachine evaluates a machine whose .Status.Phase is Failed, and provides a recommendation based on the cause of the failure
func (i *Investigation) investigateFailingMachine(machine machinev1beta1.Machine) error {
    role, err := machineutil.GetRole(machine)
    if err != nil {
        // Failing to determine whether a machine is Red Hat-managed warrants manual investigation
        notes := fmt.Sprintf("unable to determine machine role: %v", err)
        i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes)
        return fmt.Errorf("failed to determine role for machine %q: %w", machine.Name, err)
    }

    // Safely dereference status fields to avoid panics in cases where machines' statuses haven't been fully populated
    var errorMsg string
    if machine.Status.ErrorMessage != nil {
        errorMsg = *machine.Status.ErrorMessage
    }

    var errorReason machinev1beta1.MachineStatusError
    if machine.Status.ErrorReason != nil {
        errorReason = *machine.Status.ErrorReason
    }

    if role != machineutil.WorkerRoleLabelValue {
        // If the machine is Red Hat-owned, always require manual investigation
        notes := fmt.Sprintf("Red Hat-owned machine in state %q due to %q", errorReason, errorMsg)
        i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes)
        return nil
    }

    switch errorReason {
    case machinev1beta1.IPAddressInvalidReason:
        notes := fmt.Sprintf("invalid IP address: %q. Deleting machine may allow the cloud provider to assign a valid IP address", errorMsg)
        i.recommendations.addRecommendation(recommendationDeleteMachine, machine.Name, notes)

    case machinev1beta1.CreateMachineError:
        notes := fmt.Sprintf("machine failed to create: %q. Deleting machine may resolve any transient issues with the cloud provider", errorMsg)
        i.recommendations.addRecommendation(recommendationDeleteMachine, machine.Name, notes)

    case machinev1beta1.InvalidConfigurationMachineError:
        notes := fmt.Sprintf("the machine configuration is invalid: %q. Checking splunk audit logs may indicate whether the customer has modified the machine or its machineset", errorMsg)
        i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes)

    case machinev1beta1.DeleteMachineError:
        notes := fmt.Sprintf("the machine's node could not be gracefully terminated automatically: %q", errorMsg)
        i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes)

    case machinev1beta1.InsufficientResourcesMachineError:
        notes := fmt.Sprintf("a servicelog should be sent because there is insufficient quota to provision the machine: %q", errorMsg)
        i.recommendations.addRecommendation(recommendationQuotaServiceLog, machine.Name, notes)

    default:
        notes := "no .Status.ErrorReason found for machine"
        i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes)
    }
    return nil
}

// investigateDeletingMachine evaluates a machine which is being deleted, to determine if & why it is blocked, along with a recommendation on how to unblock it
func (i *Investigation) investigateDeletingMachine(ctx context.Context, machine machinev1beta1.Machine) error {
    if machine.Status.NodeRef == nil {
        notes := "machine stuck deleting with no node"
        i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes)
        return nil
    }
    node, err := machineutil.GetNodeForMachine(ctx, i.kclient, machine)
    if err != nil {
        notes := "machine's node could not be determined"
        i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes)
        return fmt.Errorf("failed to retrieve node for machine %q: %w", machine.Name, err)
    }

    stuck, duration := checkForStuckDrain(node)
    if stuck {
        notes := fmt.Sprintf("node %q found to be stuck draining for %s", node.Name, duration.Truncate(time.Second).String())
        i.recommendations.addRecommendation(recommendationInvestigateNode, node.Name, notes)
        return nil
    }

    notes := "unable to determine why machine is stuck deleting"
    i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes)
    return nil
}

// checkForStuckDrain makes a best-effort approximation at whether a node is stuck draining a specific pod
func checkForStuckDrain(node corev1.Node) (bool, *time.Duration) {
    if len(node.Spec.Taints) == 0 {
        return false, nil
    }

    taint, found := nodeutil.FindNoScheduleTaint(node)
    if !found {
        return false, nil
    }

    // TODO - Once CAD can access on-cluster metrics, we can query the `pods_preventing_node_drain` metric from prometheus
    // to more accurately gauge if a node is truly stuck deleting, and what pod is causing it
    drainDuration := time.Since(taint.TimeAdded.Time)
    if drainDuration > 10*time.Minute {
        return true, &drainDuration
    }

    return false, nil
}

// InvestigateNodes examines the provided nodes and returns recommended actions, if any are needed
func (i *Investigation) InvestigateNodes(nodes []corev1.Node) {
    for _, node := range nodes {
        i.InvestigateNode(node)
    }
}

// InvestigateNode examines a node and determines if further investigation is required
func (i *Investigation) InvestigateNode(node corev1.Node) {
    roleLabel, found := nodeutil.GetRole(node)
    if !found {
        notes := fmt.Sprintf("no role label containing %q found for node", nodeutil.RoleLabelPrefix)
        i.recommendations.addRecommendation(recommendationInvestigateNode, node.Name, notes)
        return
    } else if !strings.Contains(roleLabel, nodeutil.WorkerRoleSuffix) {
        notes := "non-worker node affected"
        i.recommendations.addRecommendation(recommendationInvestigateNode, node.Name, notes)
        return
    }

    ready, found := nodeutil.FindReadyCondition(node)
    if !found {
        notes := "found no 'Ready' .Status.Condition for the node"
        i.recommendations.addRecommendation(recommendationInvestigateNode, node.Name, notes)
        return
    }

    if ready.Status != corev1.ConditionTrue {
        lastCheckinElapsed := time.Since(ready.LastHeartbeatTime.Time).Truncate(time.Second)
        notes := fmt.Sprintf("node has been %q for %s", ready.Status, lastCheckinElapsed)
        i.recommendations.addRecommendation(recommendationInvestigateNode, node.Name, notes)
    }
}

func (i *Investigation) Name() string {
    return alertname
}

func (i *Investigation) Description() string {
    return fmt.Sprintf("Investigates '%s' alerts", alertname)
}

func (i *Investigation) IsExperimental() bool {
    return true
}

func (i *Investigation) ShouldInvestigateAlert(alert string) bool {
    return strings.Contains(alert, alertname)
}
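
Note: the investigationRecommendations type, the recommendation* constants, and the addRecommendation/summarize helpers used throughout this file are defined in a separate file added elsewhere in this pull request, which is not shown in this section. Purely for orientation, the following is a minimal sketch of what such an API could look like, inferred from the call sites above; names and shapes here are assumptions, not the implementation that was merged.

// Hypothetical sketch only -- inferred from usage in the investigation above, not part of this diff.
package machinehealthcheckunterminatedshortcircuitsre

import (
    "fmt"
    "strings"
)

// recommendationType categorizes the action CAD suggests Primary take.
type recommendationType string

const (
    recommendationInvestigateMachine recommendationType = "investigate machine"
    recommendationDeleteMachine      recommendationType = "delete machine"
    recommendationInvestigateNode    recommendationType = "investigate node"
    recommendationQuotaServiceLog    recommendationType = "send quota servicelog"
)

// recommendation ties a suggested action to the affected object and the evidence gathered during the investigation.
type recommendation struct {
    object string
    notes  string
}

// investigationRecommendations groups recommendations by the type of action required,
// which is consistent with the empty composite literal in setup() and the len() check in Run().
type investigationRecommendations map[recommendationType][]recommendation

// addRecommendation records one recommendation for the named machine or node.
func (r investigationRecommendations) addRecommendation(action recommendationType, object string, notes string) {
    r[action] = append(r[action], recommendation{object: object, notes: notes})
}

// summarize renders the collected recommendations as a single block of text
// suitable for appending to the incident notes handed to Primary.
func (r investigationRecommendations) summarize() string {
    sb := strings.Builder{}
    for action, recs := range r {
        sb.WriteString(fmt.Sprintf("%s:\n", action))
        for _, rec := range recs {
            sb.WriteString(fmt.Sprintf("  - %s: %s\n", rec.object, rec.notes))
        }
    }
    return sb.String()
}

Grouping by recommendation type (rather than keeping a flat list) would keep the summary appended to the PagerDuty note compact: one heading per action, with the affected machines or nodes listed beneath it.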
