Skip to content

Commit 02d2c44

Browse files
committed
Initial implementation for CannotRetrieveUpdatesSRE
1 parent 53d07dc commit 02d2c44

File tree

7 files changed

+352
-1
lines changed

7 files changed

+352
-1
lines changed
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# cannotretrieveupdatessre Investigation
2+
3+
Investigates the `CannotRetrieveUpdatesSRE` alert by running the network verifier and reporting `ClusterVersion` update-retrieval errors in the incident notes.
4+
5+
## Investigation Logic
6+
7+
The `CannotRetrieveUpdatesSRE` investigation is designed to diagnose issues where an OpenShift cluster cannot retrieve updates from its configured channel. It performs two main checks:
8+
1. **Network Verification**: Uses the `networkverifier` package to ensure the cluster can reach required update endpoints.
9+
2. **ClusterVersion Check**: Examines the `ClusterVersion` resource for conditions indicating update retrieval failures, such as `VersionNotFound`.
10+
11+
## Testing
12+
13+
Refer to the [testing README](./testing/README.md) for instructions on testing this investigation.
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
package cannotretrieveupdatesre
2+
3+
import (
4+
"context"
5+
"errors"
6+
"fmt"
7+
"strings"
8+
9+
configv1 "github.com/openshift/api/config/v1"
10+
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
11+
k8sclient "github.com/openshift/configuration-anomaly-detection/pkg/k8s"
12+
"github.com/openshift/configuration-anomaly-detection/pkg/logging"
13+
"github.com/openshift/configuration-anomaly-detection/pkg/networkverifier"
14+
"github.com/openshift/configuration-anomaly-detection/pkg/notewriter"
15+
"k8s.io/apimachinery/pkg/fields"
16+
"sigs.k8s.io/controller-runtime/pkg/client"
17+
)
18+
19+
// Identifiers used throughout the CannotRetrieveUpdatesSRE investigation.
const (
	// alertname is the alert handled by this investigation. Name returns it
	// and ShouldInvestigateAlert matches incoming alert titles against it.
	alertname = "CannotRetrieveUpdatesSRE"

	// remediationName is the name handed to k8sclient.New and
	// k8sclient.Cleanup.
	// NOTE(review): this is not the lowercased alert name (it has one "s"
	// fewer) — confirm the spelling is intentional.
	remediationName = "cannotretrieveupdatesre"
)
23+
24+
type Investigation struct {
25+
kclient client.Client
26+
notes *notewriter.NoteWriter
27+
}
28+
29+
// Run executes the investigation for the CannotRetrieveUpdatesSRE alert
30+
func (i *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
31+
result := investigation.InvestigationResult{}
32+
33+
// Setup
34+
err := i.setup(r)
35+
if err != nil {
36+
return result, fmt.Errorf("failed to setup investigation: %w", err)
37+
}
38+
39+
defer func(r *investigation.Resources) {
40+
logging.Infof("Cleaning up investigation resources for cluster %s", r.Cluster.ID())
41+
if cleanupErr := k8sclient.Cleanup(r.Cluster.ID(), r.OcmClient, remediationName); cleanupErr != nil {
42+
logging.Errorf("Failed to cleanup Kubernetes client: %v", cleanupErr)
43+
} else {
44+
logging.Infof("Cleanup completed successfully for cluster %s", r.Cluster.ID())
45+
}
46+
}(r)
47+
48+
if err := i.checkClusterValidity(r); err != nil {
49+
logging.Errorf("Cluster validation failed: %v", err)
50+
return result, r.PdClient.EscalateIncidentWithNote(i.notes.String())
51+
}
52+
53+
if err := i.runNetworkVerifier(r, &result); err != nil {
54+
logging.Errorf("Network verification failed: %v", err)
55+
}
56+
57+
if err := i.checkClusterVersion(r); err != nil {
58+
logging.Errorf("ClusterVersion check failed: %v", err)
59+
}
60+
61+
i.notes.AppendWarning("Alert escalated to on-call primary for review.")
62+
logging.Infof("Escalating incident with notes for cluster %s", r.Cluster.ID())
63+
err = r.PdClient.EscalateIncidentWithNote(i.notes.String())
64+
if err != nil {
65+
logging.Errorf("Failed to escalate incident to PagerDuty: %v", err)
66+
return result, fmt.Errorf("failed to escalate incident: %w", err)
67+
}
68+
logging.Infof("Investigation completed and escalated successfully for cluster %s", r.Cluster.ID())
69+
70+
return result, nil
71+
}
72+
73+
func (i *Investigation) checkClusterValidity(r *investigation.Resources) error {
74+
if r.Cluster == nil || r.Cluster.ID() == "" {
75+
errMsg := "invalid cluster configuration: cluster or cluster ID is missing"
76+
i.notes.AppendWarning(errMsg)
77+
return errors.New(errMsg)
78+
}
79+
return nil
80+
}
81+
82+
func (i *Investigation) runNetworkVerifier(r *investigation.Resources, result *investigation.InvestigationResult) error {
83+
logging.Infof("Running network verification for cluster %s", r.Cluster.ID())
84+
verifierResult, failureReason, err := networkverifier.Run(r.Cluster, r.ClusterDeployment, r.AwsClient)
85+
if err != nil {
86+
i.notes.AppendWarning("Network verifier encountered an error: %v", err)
87+
return fmt.Errorf("network verifier failed: %w", err)
88+
}
89+
90+
logging.Infof("Network verification completed with result: %v", verifierResult)
91+
switch verifierResult {
92+
case networkverifier.Success:
93+
i.notes.AppendSuccess("Network verifier passed")
94+
case networkverifier.Failure:
95+
logging.Infof("Network verifier reported failure: %s", failureReason)
96+
result.ServiceLogPrepared = investigation.InvestigationStep{Performed: true, Labels: nil}
97+
i.notes.AppendWarning("NetworkVerifier found unreachable targets. \n \n Verify and send service log if necessary: \n osdctl servicelog post %s -t https://raw.githubusercontent.com/openshift/managed-notifications/master/osd/required_network_egresses_are_blocked.json -p URLS=%s",
98+
r.Cluster.ID(), failureReason)
99+
return errors.New("network verification failed: " + failureReason)
100+
}
101+
return nil
102+
}
103+
104+
func (i *Investigation) checkClusterVersion(r *investigation.Resources) error {
105+
logging.Infof("Checking ClusterVersion for cluster %s", r.Cluster.ID())
106+
cvList := &configv1.ClusterVersionList{}
107+
listOptions := &client.ListOptions{FieldSelector: fields.SelectorFromSet(fields.Set{"metadata.name": "version"})}
108+
err := i.kclient.List(context.TODO(), cvList, listOptions)
109+
if err != nil {
110+
i.notes.AppendWarning("Failed to list ClusterVersion: %v\nThis may indicate cluster access issues", err)
111+
return fmt.Errorf("failed to list ClusterVersion: %w", err)
112+
}
113+
if len(cvList.Items) != 1 {
114+
errMsg := fmt.Sprintf("found %d ClusterVersions, expected 1", len(cvList.Items))
115+
logging.Warnf(errMsg)
116+
i.notes.AppendWarning(errMsg)
117+
return errors.New(errMsg)
118+
}
119+
120+
versionCv := cvList.Items[0]
121+
logging.Infof("ClusterVersion found: %s", versionCv.Status.Desired.Version)
122+
for _, condition := range versionCv.Status.Conditions {
123+
logging.Debugf("Checking ClusterVersion condition: Type=%s, Status=%s, Reason=%s, Message=%s",
124+
condition.Type, condition.Status, condition.Reason, condition.Message)
125+
if condition.Type == "RetrievedUpdates" &&
126+
condition.Status == "False" &&
127+
condition.Reason == "VersionNotFound" &&
128+
strings.Contains(condition.Message, "Unable to retrieve available updates") {
129+
i.notes.AppendWarning("ClusterVersion error detected: %s\nThis indicates the current version %s is not found in the specified channel %s",
130+
condition.Message, versionCv.Status.Desired.Version, versionCv.Spec.Channel)
131+
return errors.New("clusterversion validation failed: VersionNotFound")
132+
}
133+
}
134+
fmt.Printf("Cluster version: %s\n", versionCv.Status.Desired.Version)
135+
return nil
136+
}
137+
138+
// setup initializes the investigation resources
139+
func (i *Investigation) setup(r *investigation.Resources) error {
140+
logging.Infof("Setting up investigation '%s' for cluster %s with remediation name %s",
141+
i.Name(), r.Cluster.ID(), r.Name)
142+
143+
k, err := k8sclient.New(r.Cluster.ID(), r.OcmClient, remediationName)
144+
if err != nil {
145+
logging.Errorf("Failed to initialize Kubernetes client: %v", err)
146+
return fmt.Errorf("failed to initialize kubernetes client: %w", err)
147+
}
148+
i.kclient = k
149+
i.notes = notewriter.New(r.Name, logging.RawLogger)
150+
151+
logging.Infof("Successfully set up Kubernetes client and notewriter for remediation %s", r.Name)
152+
return nil
153+
}
154+
155+
func (i *Investigation) Name() string {
156+
return alertname
157+
}
158+
159+
func (i *Investigation) Description() string {
160+
return fmt.Sprintf("Investigates '%s' alerts by running network verifier and checking ClusterVersion", alertname)
161+
}
162+
163+
func (i *Investigation) ShouldInvestigateAlert(alert string) bool {
164+
return strings.Contains(alert, alertname)
165+
}
166+
167+
func (i *Investigation) IsExperimental() bool {
168+
return true
169+
}
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
package cannotretrieveupdatesre
2+
3+
import (
4+
. "github.com/onsi/ginkgo/v2"
5+
. "github.com/onsi/gomega"
6+
cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1"
7+
configv1 "github.com/openshift/api/config/v1"
8+
awsmock "github.com/openshift/configuration-anomaly-detection/pkg/aws/mock"
9+
investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
10+
"github.com/openshift/configuration-anomaly-detection/pkg/logging"
11+
pdmock "github.com/openshift/configuration-anomaly-detection/pkg/pagerduty/mock"
12+
hivev1 "github.com/openshift/hive/apis/hive/v1"
13+
"go.uber.org/mock/gomock"
14+
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
15+
"k8s.io/apimachinery/pkg/runtime"
16+
"sigs.k8s.io/controller-runtime/pkg/client"
17+
"sigs.k8s.io/controller-runtime/pkg/client/fake"
18+
)
19+
20+
var _ = Describe("CannotRetrieveUpdatesSRE Investigation", func() {
21+
var (
22+
mockCtrl *gomock.Controller
23+
clusterBuilder *cmv1.ClusterBuilder
24+
cluster *cmv1.Cluster
25+
clusterDeployment *hivev1.ClusterDeployment
26+
pdClient *pdmock.MockClient
27+
awsCli *awsmock.MockClient
28+
fakeClient client.Client
29+
scheme *runtime.Scheme
30+
inv *Investigation
31+
resources *investigation.Resources
32+
)
33+
34+
BeforeEach(func() {
35+
logging.InitLogger("fatal", "")
36+
37+
mockCtrl = gomock.NewController(GinkgoT())
38+
pdClient = pdmock.NewMockClient(mockCtrl)
39+
awsCli = awsmock.NewMockClient(mockCtrl)
40+
41+
clusterBuilder = cmv1.NewCluster().ID("test-cluster")
42+
var err error
43+
cluster, err = clusterBuilder.Build()
44+
Expect(err).ToNot(HaveOccurred())
45+
46+
clusterDeployment = &hivev1.ClusterDeployment{
47+
Spec: hivev1.ClusterDeploymentSpec{
48+
ClusterMetadata: &hivev1.ClusterMetadata{
49+
InfraID: "infra_id",
50+
},
51+
},
52+
}
53+
54+
scheme = runtime.NewScheme()
55+
Expect(configv1.AddToScheme(scheme)).To(Succeed())
56+
fakeClient = fake.NewClientBuilder().WithScheme(scheme).Build()
57+
58+
inv = &Investigation{
59+
kclient: fakeClient,
60+
}
61+
resources = &investigation.Resources{
62+
Cluster: cluster,
63+
ClusterDeployment: clusterDeployment,
64+
PdClient: pdClient,
65+
AwsClient: awsCli,
66+
Name: remediationName,
67+
}
68+
})
69+
70+
AfterEach(func() {
71+
mockCtrl.Finish()
72+
})
73+
74+
Describe("Run Method", func() {
75+
When("ClusterVersion has VersionNotFound condition", func() {
76+
It("Should detect the condition and escalate with appropriate notes", func() {
77+
cv := &configv1.ClusterVersion{
78+
ObjectMeta: v1.ObjectMeta{Name: "version"},
79+
Spec: configv1.ClusterVersionSpec{Channel: "stable-4.18"},
80+
Status: configv1.ClusterVersionStatus{
81+
Desired: configv1.Release{Version: "4.18.5"},
82+
Conditions: []configv1.ClusterOperatorStatusCondition{
83+
{
84+
Type: "RetrievedUpdates",
85+
Status: "False",
86+
Reason: "VersionNotFound",
87+
Message: "Unable to retrieve available updates: version 4.18.5 not found",
88+
},
89+
},
90+
},
91+
}
92+
fakeClient = fake.NewClientBuilder().WithScheme(scheme).WithObjects(cv).Build()
93+
inv.kclient = fakeClient
94+
95+
// Arrange
96+
awsCli.EXPECT().GetSecurityGroupID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return("sg-123", nil)
97+
awsCli.EXPECT().GetSubnetID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return([]string{"subnet-1"}, nil)
98+
pdClient.EXPECT().EscalateIncidentWithNote(gomock.Any()).DoAndReturn(func(note string) error {
99+
Expect(note).To(ContainSubstring("Network verifier passed"))
100+
Expect(note).To(ContainSubstring("ClusterVersion error detected: Unable to retrieve available updates: version 4.18.5 not found"))
101+
Expect(note).To(ContainSubstring("This indicates the current version 4.18.5 is not found in the specified channel stable-4.18"))
102+
Expect(note).To(ContainSubstring("Alert escalated to on-call primary for review"))
103+
return nil
104+
})
105+
106+
// Act
107+
result, err := inv.Run(resources)
108+
109+
// Assert
110+
Expect(err).ToNot(HaveOccurred())
111+
Expect(result.ServiceLogPrepared.Performed).To(BeFalse())
112+
})
113+
})
114+
115+
When("Network verifier fails", func() {
116+
It("Should prepare a service log and escalate", func() {
117+
// Arrange
118+
awsCli.EXPECT().GetSecurityGroupID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return("sg-123", nil)
119+
awsCli.EXPECT().GetSubnetID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return([]string{"subnet-1"}, nil)
120+
pdClient.EXPECT().EscalateIncidentWithNote(gomock.Any()).DoAndReturn(func(note string) error {
121+
Expect(note).To(ContainSubstring("NetworkVerifier found unreachable targets"))
122+
Expect(note).To(ContainSubstring("osdctl servicelog post test-cluster"))
123+
Expect(note).To(ContainSubstring("Alert escalated to on-call primary for review"))
124+
return nil
125+
})
126+
127+
// Act
128+
result, err := inv.Run(resources)
129+
130+
// Assert
131+
Expect(err).ToNot(HaveOccurred())
132+
Expect(result.ServiceLogPrepared.Performed).To(BeTrue())
133+
})
134+
})
135+
136+
When("Kubernetes client fails to list ClusterVersion", func() {
137+
It("Should escalate with a warning note", func() {
138+
fakeClient = fake.NewClientBuilder().WithScheme(scheme).WithRuntimeObjects().Build()
139+
inv.kclient = fakeClient
140+
141+
// Arrange
142+
awsCli.EXPECT().GetSecurityGroupID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return("sg-123", nil)
143+
awsCli.EXPECT().GetSubnetID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return([]string{"subnet-1"}, nil)
144+
pdClient.EXPECT().EscalateIncidentWithNote(gomock.Any()).DoAndReturn(func(note string) error {
145+
Expect(note).To(ContainSubstring("Network verifier passed"))
146+
Expect(note).To(ContainSubstring("Failed to list ClusterVersion"))
147+
Expect(note).To(ContainSubstring("This may indicate cluster access issues"))
148+
Expect(note).To(ContainSubstring("Alert escalated to on-call primary for review"))
149+
return nil
150+
})
151+
152+
// Act
153+
result, err := inv.Run(resources)
154+
155+
// Assert
156+
Expect(err).ToNot(HaveOccurred())
157+
Expect(result.ServiceLogPrepared.Performed).To(BeFalse())
158+
})
159+
})
160+
})
161+
})
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Testing CannotRetrieveUpdatesSRE Investigation
2+
3+
TODO:
4+
- Add a test script or test objects to this directory for future maintainers to use
5+
- Edit this README file and add detailed instructions on how to use the script/objects to recreate the conditions for the investigation. Be sure to include any assumptions or prerequisites about the environment (disable Hive syncsetting, etc.).

pkg/investigations/registry.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package investigations
22

33
import (
4+
cannotretrieveupdatesre "github.com/openshift/configuration-anomaly-detection/pkg/investigations/cannotretrieveupdatesre"
45
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/ccam"
56
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/chgm"
67
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/clustermonitoringerrorbudgetburn"
@@ -16,6 +17,7 @@ var availableInvestigations = []investigation.Investigation{
1617
&clustermonitoringerrorbudgetburn.Investigation{},
1718
&cpd.Investigation{},
1819
&insightsoperatordown.Investigation{},
20+
&cannotretrieveupdatesre.Investigation{},
1921
}
2022

2123
// GetInvestigation returns the first Investigation that applies to the given alert title.

test/generate_incident.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#!/bin/bash
1+
#!/usr/bin/env bash
22
set -e
33

44
# Define the mapping of alert names to titles
@@ -8,6 +8,7 @@ declare -A alert_mapping=(
88
["ClusterProvisioningDelay"]="ClusterProvisioningDelay -"
99
["ClusterMonitoringErrorBudgetBurnSRE"]="ClusterMonitoringErrorBudgetBurnSRE Critical (1)"
1010
["InsightsOperatorDown"]="InsightsOperatorDown"
11+
["CannotRetrieveUpdatesSRE"]="CannotRetrieveUpdatesSRE"
1112
)
1213

1314
# Function to print help message

0 commit comments

Comments
 (0)