Skip to content

Commit ac7a009

Browse files
committed
Initial implementation for CannotRetrieveUpdatesSRE
1 parent 19709d6 commit ac7a009

File tree

9 files changed

+322
-0
lines changed

9 files changed

+322
-0
lines changed
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# cannotretrieveupdatessre Investigation
2+
3+
Investigates the CannotRetrieveUpdatesSRE alert by running the network verifier and checking the ClusterVersion status, then updates the PagerDuty alert notes with the investigation details.
4+
5+
## Investigation Logic
6+
7+
The `CannotRetrieveUpdatesSRE` investigation is designed to diagnose issues where an OpenShift cluster cannot retrieve updates from its configured channel. It performs two main checks:
8+
1. **Network Verification**: Uses the `networkverifier` package to ensure the cluster can reach required update endpoints.
9+
2. **ClusterVersion Check**: Examines the `ClusterVersion` resource for conditions indicating update retrieval failures, such as `VersionNotFound`.
10+
11+
## Testing
12+
13+
Refer to the [testing README](./testing/README.md) for instructions on testing this investigation.
Binary file not shown.
Binary file not shown.
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
package cannotretrieveupdatessre
2+
3+
import (
4+
"context"
5+
"errors"
6+
"fmt"
7+
"strings"
8+
9+
configv1 "github.com/openshift/api/config/v1"
10+
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
11+
k8sclient "github.com/openshift/configuration-anomaly-detection/pkg/k8s"
12+
"github.com/openshift/configuration-anomaly-detection/pkg/logging"
13+
"github.com/openshift/configuration-anomaly-detection/pkg/networkverifier"
14+
"github.com/openshift/configuration-anomaly-detection/pkg/notewriter"
15+
"sigs.k8s.io/controller-runtime/pkg/client"
16+
)
17+
18+
const (
	// alertname is the alert title this investigation matches against
	// (see ShouldInvestigateAlert).
	alertname = "CannotRetrieveUpdatesSRE"
	// remediationName identifies this remediation when creating and
	// cleaning up cluster access via the k8s client.
	remediationName = "CannotRetrieveUpdatesSRE"
)

// Investigation implements the CannotRetrieveUpdatesSRE alert investigation.
type Investigation struct{}
24+
25+
// Run executes the investigation for the CannotRetrieveUpdatesSRE alert
26+
func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
27+
result := investigation.InvestigationResult{}
28+
notes := notewriter.New("CannotRetrieveUpdatesSRE", logging.RawLogger)
29+
k8scli, err := k8sclient.New(r.Cluster.ID(), r.OcmClient, remediationName)
30+
if err != nil {
31+
return result, fmt.Errorf("unable to initialize k8s cli: %w", err)
32+
}
33+
defer func() {
34+
deferErr := k8sclient.Cleanup(r.Cluster.ID(), r.OcmClient, remediationName)
35+
if deferErr != nil {
36+
logging.Error(deferErr)
37+
err = errors.Join(err, deferErr)
38+
}
39+
}()
40+
41+
defer func(r *investigation.Resources) {
42+
logging.Infof("Cleaning up investigation resources for cluster %s", r.Cluster.ID())
43+
if cleanupErr := k8sclient.Cleanup(r.Cluster.ID(), r.OcmClient, remediationName); cleanupErr != nil {
44+
logging.Errorf("Failed to cleanup Kubernetes client: %v", cleanupErr)
45+
} else {
46+
logging.Infof("Cleanup completed successfully for cluster %s", r.Cluster.ID())
47+
}
48+
}(r)
49+
50+
verifierResult, failureReason, err := networkverifier.Run(r.Cluster, r.ClusterDeployment, r.AwsClient)
51+
if err != nil {
52+
logging.Error("Network verifier ran into an error: %s", err.Error())
53+
notes.AppendWarning("NetworkVerifier failed to run:\n\t %s", err.Error())
54+
55+
err = r.PdClient.AddNote(notes.String())
56+
if err != nil {
57+
// We do not return as we want the alert to be escalated either no matter what.
58+
logging.Error("could not add failure reason incident notes")
59+
}
60+
}
61+
62+
switch verifierResult {
63+
case networkverifier.Failure:
64+
logging.Infof("Network verifier reported failure: %s", failureReason)
65+
// XXX: metrics.Inc(metrics.ServicelogPrepared, investigationName)
66+
result.ServiceLogPrepared = investigation.InvestigationStep{Performed: true, Labels: nil}
67+
notes.AppendWarning("NetworkVerifier found unreachable targets. \n \n Verify and send service log if necessary: \n osdctl servicelog post %s -t https://raw.githubusercontent.com/openshift/managed-notifications/master/osd/required_network_egresses_are_blocked.json -p URLS=%s", r.Cluster.ID(), failureReason)
68+
69+
// In the future, we want to send a service log in this case
70+
err = r.PdClient.AddNote(notes.String())
71+
if err != nil {
72+
logging.Error("could not add issues to incident notes")
73+
}
74+
case networkverifier.Success:
75+
notes.AppendSuccess("Network verifier passed")
76+
err = r.PdClient.AddNote(notes.String())
77+
if err != nil {
78+
logging.Error("could not add passed message to incident notes")
79+
}
80+
}
81+
82+
// Check ClusterVersion
83+
clusterVersion, note, err := checkClusterVersion(k8scli, r.Cluster.ID())
84+
if err != nil {
85+
notes.AppendWarning("Failure checking ClusterVersion: %s", err.Error())
86+
notes.AppendWarning("Alert escalated to on-call primary for review.")
87+
logging.Infof("Escalating incident with notes for cluster %s", r.Cluster.ID())
88+
err = r.PdClient.EscalateIncidentWithNote(notes.String())
89+
if err != nil {
90+
logging.Errorf("Failed to escalate incident to PagerDuty: %v", err)
91+
return result, fmt.Errorf("failed to escalate incident: %w", err)
92+
}
93+
return result, err
94+
}
95+
if note != "" {
96+
notes.AppendWarning(note)
97+
err = r.PdClient.AddNote(notes.String())
98+
if err != nil {
99+
logging.Error("could not add notes to the incident")
100+
}
101+
}
102+
if clusterVersion != "" {
103+
notes.AppendSuccess("ClusterVersion found: %s", clusterVersion)
104+
err = r.PdClient.AddNote(notes.String())
105+
if err != nil {
106+
logging.Error("could not add passed message to incident notes")
107+
}
108+
}
109+
110+
notes.AppendWarning("Alert escalated to on-call primary for review.")
111+
logging.Infof("Escalating incident with notes for cluster %s", r.Cluster.ID())
112+
err = r.PdClient.EscalateIncidentWithNote(notes.String())
113+
if err != nil {
114+
logging.Errorf("Failed to escalate incident to PagerDuty: %v", err)
115+
return result, fmt.Errorf("failed to escalate incident: %w", err)
116+
}
117+
logging.Infof("Investigation completed and escalated successfully for cluster %s", r.Cluster.ID())
118+
119+
return result, nil
120+
}
121+
122+
// checkClusterVersion retrieves the cluster version
123+
func checkClusterVersion(k8scli client.Client, clusterID string) (version string, note string, err error) {
124+
logging.Infof("Checking ClusterVersion for cluster %s", clusterID)
125+
clusterVersion := &configv1.ClusterVersion{}
126+
err = k8scli.Get(context.TODO(), client.ObjectKey{Name: "version"}, clusterVersion)
127+
if err != nil {
128+
return "", "Failed to get ClusterVersion: cluster access issues detected", fmt.Errorf("failed to get ClusterVersion: %w", err)
129+
}
130+
logging.Infof("ClusterVersion channel: %s", clusterVersion.Spec.Channel)
131+
logging.Infof("ClusterVersion found: %s", clusterVersion.Status.Desired.Version)
132+
logging.Debugf("ClusterVersion conditions: %+v", clusterVersion.Status.Conditions)
133+
134+
for _, condition := range clusterVersion.Status.Conditions {
135+
logging.Debugf("Checking ClusterVersion condition: Type=%s, Status=%s, Reason=%s, Message=%q",
136+
condition.Type, condition.Status, condition.Reason, condition.Message)
137+
if condition.Type == "RetrievedUpdates" && condition.Status == "False" {
138+
if (condition.Reason == "VersionNotFound" || condition.Reason == "RemoteFailed") &&
139+
strings.Contains(strings.TrimSpace(condition.Message), "Unable to retrieve available updates") {
140+
logging.Warnf("Detected ClusterVersion error: Reason=%s, Message=%s", condition.Reason, condition.Message)
141+
return "", fmt.Sprintf("ClusterVersion error detected: %s. Current version %s not found in channel %s",
142+
condition.Message, clusterVersion.Status.Desired.Version, clusterVersion.Spec.Channel),
143+
fmt.Errorf("clusterversion validation failed: %s", condition.Reason)
144+
}
145+
}
146+
}
147+
return clusterVersion.Status.Desired.Version, "", nil
148+
}
149+
150+
// Name returns the alert name this investigation handles.
func (i *Investigation) Name() string {
	return alertname
}
153+
154+
func (i *Investigation) Description() string {
155+
return fmt.Sprintf("Investigates '%s' alerts by running network verifier and checking ClusterVersion", alertname)
156+
}
157+
158+
func (i *Investigation) ShouldInvestigateAlert(alert string) bool {
159+
return strings.Contains(alert, alertname)
160+
}
161+
162+
// IsExperimental marks this investigation as experimental.
func (i *Investigation) IsExperimental() bool {
	return true
}
Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
package cannotretrieveupdatessre
2+
3+
import (
4+
"strings"
5+
"testing"
6+
7+
configv1 "github.com/openshift/api/config/v1"
8+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
9+
"k8s.io/client-go/kubernetes/scheme"
10+
"sigs.k8s.io/controller-runtime/pkg/client"
11+
"sigs.k8s.io/controller-runtime/pkg/client/fake"
12+
)
13+
14+
func newFakeClient(objs ...client.Object) (client.Client, error) {
15+
s := scheme.Scheme
16+
err := configv1.AddToScheme(s)
17+
if err != nil {
18+
return nil, err
19+
}
20+
21+
client := fake.NewClientBuilder().WithScheme(s).WithObjects(objs...).Build()
22+
return client, nil
23+
}
24+
25+
func TestCheckClusterVersion(t *testing.T) {
26+
tests := []struct {
27+
name string
28+
clusterVersion *configv1.ClusterVersion
29+
expectedVersion string
30+
expectError bool
31+
expectedNote string
32+
}{
33+
{
34+
name: "RemoteFailed condition",
35+
clusterVersion: &configv1.ClusterVersion{
36+
ObjectMeta: metav1.ObjectMeta{
37+
Name: "version",
38+
},
39+
Spec: configv1.ClusterVersionSpec{
40+
Channel: "stable-4.18-test",
41+
ClusterID: "d1ba89f3-fd3e-48d2-91c6-test",
42+
},
43+
Status: configv1.ClusterVersionStatus{
44+
Desired: configv1.Release{Version: "4.18.10"},
45+
Conditions: []configv1.ClusterOperatorStatusCondition{
46+
{
47+
Type: "RetrievedUpdates",
48+
Status: "False",
49+
Reason: "RemoteFailed",
50+
Message: `Unable to retrieve available updates:`,
51+
},
52+
},
53+
},
54+
},
55+
expectedVersion: "",
56+
expectError: true,
57+
expectedNote: `ClusterVersion error detected: Unable to retrieve available updates`,
58+
},
59+
{
60+
name: "VersionNotFound condition",
61+
clusterVersion: &configv1.ClusterVersion{
62+
ObjectMeta: metav1.ObjectMeta{
63+
Name: "version",
64+
},
65+
Spec: configv1.ClusterVersionSpec{
66+
Channel: "stable-4.18-test",
67+
ClusterID: "d1ba89f3-fd3e-48d2-91c6-test",
68+
},
69+
Status: configv1.ClusterVersionStatus{
70+
Desired: configv1.Release{Version: "4.18.10"},
71+
Conditions: []configv1.ClusterOperatorStatusCondition{
72+
{
73+
Type: "RetrievedUpdates",
74+
Status: "False",
75+
Reason: "VersionNotFound",
76+
Message: `Unable to retrieve available updates: version 4.18.10 not found in channel stable-4.18-test`,
77+
},
78+
},
79+
},
80+
},
81+
expectedVersion: "",
82+
expectError: true,
83+
expectedNote: `ClusterVersion error detected: Unable to retrieve available updates: version 4.18.10 not found in channel stable-4.18-test.`,
84+
},
85+
{
86+
name: "Happy path",
87+
clusterVersion: &configv1.ClusterVersion{
88+
ObjectMeta: metav1.ObjectMeta{
89+
Name: "version",
90+
},
91+
Spec: configv1.ClusterVersionSpec{
92+
Channel: "stable-4.18",
93+
ClusterID: "d1ba89f3-fd3e-48d2-91c6-test",
94+
},
95+
Status: configv1.ClusterVersionStatus{
96+
Desired: configv1.Release{Version: "4.18.10"},
97+
Conditions: []configv1.ClusterOperatorStatusCondition{
98+
{
99+
Type: "RetrievedUpdates",
100+
Status: "True",
101+
Reason: "UpdatesRetrieved",
102+
Message: "Available updates retrieved successfully",
103+
},
104+
},
105+
},
106+
},
107+
expectedVersion: "4.18.10",
108+
expectError: false,
109+
expectedNote: "",
110+
},
111+
}
112+
113+
for _, tt := range tests {
114+
t.Run(tt.name, func(t *testing.T) {
115+
k8scli, err := newFakeClient(tt.clusterVersion)
116+
if err != nil {
117+
t.Fatalf("failed to create a fake client: %v", err)
118+
}
119+
version, note, err := checkClusterVersion(k8scli, "test-cluster")
120+
121+
if version != tt.expectedVersion {
122+
if !strings.Contains(note, tt.expectedNote) {
123+
t.Errorf("Expected note message: %s. Got %s", tt.expectedNote, note)
124+
}
125+
if !tt.expectError {
126+
t.Errorf("Expected version %s, got %s", tt.expectedVersion, version)
127+
}
128+
}
129+
130+
if tt.expectError && err == nil {
131+
t.Errorf("Expected an error, got none")
132+
} else if !tt.expectError && err != nil {
133+
t.Errorf("Expected no error, got %v", err)
134+
}
135+
})
136+
}
137+
}
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Testing CannotRetrieveUpdatesSRE Investigation
2+
3+
TODO:
4+
- Add a test script or test objects to this directory for future maintainers to use
5+
- Edit this README file and add detailed instructions on how to use the script/objects to recreate the conditions for the investigation. Be sure to include any assumptions or prerequisites about the environment (disable hive syncsetting, etc)

pkg/investigations/registry.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package investigations
22

33
import (
4+
cannotretrieveupdatessre "github.com/openshift/configuration-anomaly-detection/pkg/investigations/cannotretrieveupdatessre"
45
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/ccam"
56
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/chgm"
67
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/clustermonitoringerrorbudgetburn"
@@ -20,6 +21,7 @@ var availableInvestigations = []investigation.Investigation{
2021
&insightsoperatordown.Investigation{},
2122
&upgradeconfigsyncfailureover4hr.Investigation{},
2223
&machinehealthcheckunterminatedshortcircuitsre.Investigation{},
24+
&cannotretrieveupdatessre.Investigation{},
2325
}
2426

2527
// GetInvestigation returns the first Investigation that applies to the given alert title.

test/generate_incident.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ declare -A alert_mapping=(
99
["ClusterMonitoringErrorBudgetBurnSRE"]="ClusterMonitoringErrorBudgetBurnSRE Critical (1)"
1010
["InsightsOperatorDown"]="InsightsOperatorDown"
1111
["MachineHealthCheckUnterminatedShortCircuitSRE"]="MachineHealthCheckUnterminatedShortCircuitSRE CRITICAL (1)"
12+
["CannotRetrieveUpdatesSRE"]="CannotRetrieveUpdatesSRE"
1213
)
1314

1415
# Function to print help message

0 commit comments

Comments
 (0)