/*
Package machinehealthcheckunterminatedshortcircuitsre defines the investigation logic for the MachineHealthCheckUnterminatedShortCircuitSRE alert
*/
package machinehealthcheckunterminatedshortcircuitsre

import (
	"context"
	"errors"
	"fmt"
	"slices"
	"strings"
	"time"

	machinev1beta1 "github.com/openshift/api/machine/v1beta1"

	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
	machineutil "github.com/openshift/configuration-anomaly-detection/pkg/investigations/utils/machine"
	nodeutil "github.com/openshift/configuration-anomaly-detection/pkg/investigations/utils/node"
	k8sclient "github.com/openshift/configuration-anomaly-detection/pkg/k8s"
	"github.com/openshift/configuration-anomaly-detection/pkg/logging"
	"github.com/openshift/configuration-anomaly-detection/pkg/notewriter"

	corev1 "k8s.io/api/core/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

const (
	alertname = "MachineHealthCheckUnterminatedShortCircuitSRE"
	// remediationName must match the name of this investigation's directory, so it can be looked up via the backplane-api
	remediationName = "machineHealthCheckUnterminatedShortCircuitSRE"
)

type Investigation struct {
	// kclient provides access to on-cluster resources
	kclient client.Client
	// notes holds the messages that will be shared with Primary upon completion
	notes *notewriter.NoteWriter
	// recommendations holds the set of actions CAD recommends primary to take
	recommendations investigationRecommendations
}

func (i *Investigation) setup(r *investigation.Resources) error {
	// Setup investigation
	k, err := k8sclient.New(r.Cluster.ID(), r.OcmClient, remediationName)
	if err != nil {
		return fmt.Errorf("failed to initialize kubernetes client: %w", err)
	}
	i.kclient = k
	i.notes = notewriter.New(r.Name, logging.RawLogger)
	i.recommendations = investigationRecommendations{}

	return nil
}

// Run investigates the MachineHealthCheckUnterminatedShortCircuitSRE alert
//
// The investigation seeks to provide exactly one recommended action per affected machine/node pair.
// The machine object is evaluated first, as it represents a lower-level object that could affect the health of the node.
// If the investigation determines that the breakage is occurring at the machine level, the corresponding node is *not* investigated.
// After investigating all affected machines, potentially affected nodes are investigated.
func (i *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
	ctx := context.Background()
	result := investigation.InvestigationResult{}

	// Setup & teardown
	err := i.setup(r)
	if err != nil {
		return result, fmt.Errorf("failed to setup investigation: %w", err)
	}
	defer func(r *investigation.Resources) {
		err := k8sclient.Cleanup(r.Cluster.ID(), r.OcmClient, remediationName)
		if err != nil {
			logging.Errorf("failed to cleanup investigation: %v", err)
		}
	}(r)

	targetMachines, err := i.getMachinesFromFailingMHC(ctx)
	if err != nil {
		i.notes.AppendWarning("failed to retrieve one or more target machines: %v", err)
	}
	if len(targetMachines) == 0 {
		i.notes.AppendWarning("no machines found for short-circuited machinehealthcheck objects")
		return result, r.PdClient.EscalateIncidentWithNote(i.notes.String())
	}

	problemMachines, err := i.InvestigateMachines(ctx, targetMachines)
	if err != nil {
		i.notes.AppendWarning("failed to investigate machines: %v", err)
	}

	// Trim out the machines that we've already investigated and know have problems
	//
	// The purpose of this is to avoid re-investigating nodes whose machines were already investigated. Any node-level issue on a failing machine
	// is most likely related to the machine itself, and providing duplicate/conflicting advice will only prove confusing to the responder
	targetMachines = slices.DeleteFunc(targetMachines, func(targetMachine machinev1beta1.Machine) bool {
		return slices.ContainsFunc(problemMachines, func(problemMachine machinev1beta1.Machine) bool {
			return problemMachine.Name == targetMachine.Name
		})
	})

	// If one or more machines managed by the machinehealthcheck have not yet been identified as a problem, check on the machine's
	// node to determine if there are node-level problems that need remediating
	if len(targetMachines) > 0 {
		targetNodes, err := machineutil.GetNodesForMachines(ctx, i.kclient, targetMachines)
		if err != nil {
			i.notes.AppendWarning("failed to retrieve one or more target nodes: %v", err)
		}
		if len(targetNodes) > 0 {
			i.InvestigateNodes(targetNodes)
		}
	}

	// Summarize recommendations from investigation in PD notes, if any found
	if len(i.recommendations) > 0 {
		i.notes.AppendWarning(i.recommendations.summarize())
	} else {
		i.notes.AppendSuccess("no recommended actions to take against cluster")
	}

	return result, r.PdClient.EscalateIncidentWithNote(i.notes.String())
}

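// getMachinesFromFailingMHC returns the machines monitored by any machinehealthcheck that does not currently permit remediation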
func (i *Investigation) getMachinesFromFailingMHC(ctx context.Context) ([]machinev1beta1.Machine, error) {
	healthchecks := machinev1beta1.MachineHealthCheckList{}
	err := i.kclient.List(ctx, &healthchecks, &client.ListOptions{Namespace: machineutil.MachineNamespace})
	if err != nil {
		return []machinev1beta1.Machine{}, fmt.Errorf("failed to retrieve machinehealthchecks from cluster: %w", err)
	}

	targets := []machinev1beta1.Machine{}
	for _, healthcheck := range healthchecks.Items {
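		// A machinehealthcheck that disallows remediation has "short-circuited": its count of unhealthy machines meets or
		// exceeds the .spec.maxUnhealthy threshold, so automatic remediation is paused and the machines need manual review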
		if !machineutil.HealthcheckRemediationAllowed(healthcheck) {
			machines, err := machineutil.GetMachinesForMHC(ctx, i.kclient, healthcheck)
			if err != nil {
				i.notes.AppendWarning("failed to retrieve machines from machinehealthcheck %q: %v", healthcheck.Name, err)
				continue
			}
			targets = append(targets, machines...)
		}
	}

	return targets, nil
}

// InvestigateMachines evaluates the state of the machines in the cluster, recording a recommendation for each machine found to be
// deleting or Failed, and returns the list of machines that were investigated
func (i *Investigation) InvestigateMachines(ctx context.Context, targets []machinev1beta1.Machine) ([]machinev1beta1.Machine, error) {
	investigatedMachines := []machinev1beta1.Machine{}
	var errs error
	for _, machine := range targets {
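		// A set DeletionTimestamp indicates the machine is already being deleted; check whether the deletion is blocked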
		if machine.DeletionTimestamp != nil {
			err := i.investigateDeletingMachine(ctx, machine)
			investigatedMachines = append(investigatedMachines, machine)
			if err != nil {
				errs = errors.Join(errs, fmt.Errorf("failed to investigate deleting machine: %w", err))
			}
			continue
		}

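		// A machine in the Failed phase (or one reporting an ErrorReason) never became healthy; investigate the cause of the failure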
		if (machine.Status.Phase != nil && *machine.Status.Phase == machinev1beta1.PhaseFailed) || machine.Status.ErrorReason != nil {
			err := i.investigateFailingMachine(machine)
			investigatedMachines = append(investigatedMachines, machine)
			if err != nil {
				errs = errors.Join(errs, fmt.Errorf("failed to investigate failing machine: %w", err))
			}
		}
	}

	if len(investigatedMachines) == 0 {
		i.notes.AppendSuccess("no deleting or Failed machines found")
	}
	return investigatedMachines, errs
}

// investigateFailingMachine evaluates a machine whose .Status.Phase is Failed, and provides a recommendation based on the cause of the failure
func (i *Investigation) investigateFailingMachine(machine machinev1beta1.Machine) error {
	role, err := machineutil.GetRole(machine)
	if err != nil {
		// Failing to determine whether a machine is Red Hat-managed warrants manual investigation
		notes := fmt.Sprintf("unable to determine machine role: %v", err)
		i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes)
		return fmt.Errorf("failed to determine role for machine %q: %w", machine.Name, err)
	}

	// Safely dereference status fields to avoid panics in cases where machines' statuses haven't been fully populated
	var errorMsg string
	if machine.Status.ErrorMessage != nil {
		errorMsg = *machine.Status.ErrorMessage
	}

	var errorReason machinev1beta1.MachineStatusError
	if machine.Status.ErrorReason != nil {
		errorReason = *machine.Status.ErrorReason
	}

	if role != machineutil.WorkerRoleLabelValue {
		// Non-worker machines are Red Hat-managed and always require manual investigation
		notes := fmt.Sprintf("Red Hat-owned machine in state %q due to %q", errorReason, errorMsg)
		i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes)
		return nil
	}

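	// Map the machine's error reason onto the single action most likely to resolve or diagnose the failure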
	switch errorReason {
	case machinev1beta1.IPAddressInvalidReason:
		notes := fmt.Sprintf("invalid IP address: %q. Deleting machine may allow the cloud provider to assign a valid IP address", errorMsg)
		i.recommendations.addRecommendation(recommendationDeleteMachine, machine.Name, notes)

	case machinev1beta1.CreateMachineError:
		notes := fmt.Sprintf("machine failed to create: %q. Deleting machine may resolve any transient issues with the cloud provider", errorMsg)
		i.recommendations.addRecommendation(recommendationDeleteMachine, machine.Name, notes)

	case machinev1beta1.InvalidConfigurationMachineError:
		notes := fmt.Sprintf("the machine configuration is invalid: %q. Checking Splunk audit logs may indicate whether the customer has modified the machine or its machineset", errorMsg)
		i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes)

	case machinev1beta1.DeleteMachineError:
		notes := fmt.Sprintf("the machine's node could not be gracefully terminated automatically: %q", errorMsg)
		i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes)

	case machinev1beta1.InsufficientResourcesMachineError:
		notes := fmt.Sprintf("a servicelog should be sent because there is insufficient quota to provision the machine: %q", errorMsg)
		i.recommendations.addRecommendation(recommendationQuotaServiceLog, machine.Name, notes)

	default:
		notes := "machine Failed with no .Status.ErrorReason"
		if errorReason != "" {
			notes = fmt.Sprintf("machine Failed with unhandled .Status.ErrorReason %q: %q", errorReason, errorMsg)
		}
		i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes)
	}
	return nil
}

// investigateDeletingMachine evaluates a machine which is being deleted, to determine if & why it is blocked, along with a recommendation on how to unblock it
func (i *Investigation) investigateDeletingMachine(ctx context.Context, machine machinev1beta1.Machine) error {
	if machine.Status.NodeRef == nil {
		notes := "machine stuck deleting with no node"
		i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes)
		return nil
	}
	node, err := machineutil.GetNodeForMachine(ctx, i.kclient, machine)
	if err != nil {
		notes := "machine's node could not be determined"
		i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes)
		return fmt.Errorf("failed to retrieve node for machine %q: %w", machine.Name, err)
	}

	stuck, duration := checkForStuckDrain(node)
	if stuck {
		notes := fmt.Sprintf("node %q found to be stuck draining for %s", node.Name, duration.Truncate(time.Second).String())
		i.recommendations.addRecommendation(recommendationInvestigateNode, node.Name, notes)
		return nil
	}

	notes := "unable to determine why machine is stuck deleting"
	i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes)
	return nil
}

// checkForStuckDrain makes a best-effort determination of whether a node is stuck draining, and if so, for how long
func checkForStuckDrain(node corev1.Node) (bool, *time.Duration) {
	if len(node.Spec.Taints) == 0 {
		return false, nil
	}

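	// A node being drained carries a NoSchedule taint; the taint's TimeAdded is used to approximate when the drain began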
	taint, found := nodeutil.FindNoScheduleTaint(node)
	if !found {
		return false, nil
	}

	// TODO - Once CAD can access on-cluster metrics, we can query the `pods_preventing_node_drain` metric from prometheus
	// to more accurately gauge if a node is truly stuck deleting, and what pod is causing it
	drainDuration := time.Since(taint.TimeAdded.Time)
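	// Consider a drain lasting longer than 10 minutes "stuck"; the threshold is a heuristic, not an exact measure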
	if drainDuration > 10*time.Minute {
		return true, &drainDuration
	}

	return false, nil
}

// InvestigateNodes examines the provided nodes, recording a recommended action for any that require attention
func (i *Investigation) InvestigateNodes(nodes []corev1.Node) {
	for _, node := range nodes {
		i.InvestigateNode(node)
	}
}

// InvestigateNode examines a node and determines if further investigation is required
func (i *Investigation) InvestigateNode(node corev1.Node) {
	roleLabel, found := nodeutil.GetRole(node)
	if !found {
		notes := fmt.Sprintf("no role label containing %q found for node", nodeutil.RoleLabelPrefix)
		i.recommendations.addRecommendation(recommendationInvestigateNode, node.Name, notes)
		return
	} else if !strings.Contains(roleLabel, nodeutil.WorkerRoleSuffix) {
		notes := "non-worker node affected"
		i.recommendations.addRecommendation(recommendationInvestigateNode, node.Name, notes)
		return
	}

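	// The node's 'Ready' condition indicates whether the kubelet is healthy and heartbeating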
	ready, found := nodeutil.FindReadyCondition(node)
	if !found {
		notes := "found no 'Ready' .Status.Condition for the node"
		i.recommendations.addRecommendation(recommendationInvestigateNode, node.Name, notes)
		return
	}

	if ready.Status != corev1.ConditionTrue {
		lastCheckinElapsed := time.Since(ready.LastHeartbeatTime.Time).Truncate(time.Second)
		notes := fmt.Sprintf("node has been %q for %s", ready.Status, lastCheckinElapsed)
		i.recommendations.addRecommendation(recommendationInvestigateNode, node.Name, notes)
	}
}

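// Name returns the name of this investigation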
func (i *Investigation) Name() string {
	return alertname
}

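// Description returns a human-readable summary of this investigation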
func (i *Investigation) Description() string {
	return fmt.Sprintf("Investigates '%s' alerts", alertname)
}

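// IsExperimental indicates that this investigation is still under evaluation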
func (i *Investigation) IsExperimental() bool {
	return true
}

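// ShouldInvestigateAlert determines whether this investigation applies to the given alert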
func (i *Investigation) ShouldInvestigateAlert(alert string) bool {
	return strings.Contains(alert, alertname)
}