Skip to content

Commit bcb9b3c

Browse files
committed
Initial implementation for CannotRetrieveUpdatesSRE
1 parent 53d07dc commit bcb9b3c

File tree

6 files changed

+348
-1
lines changed

6 files changed

+348
-1
lines changed
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
package CannotRetrieveUpdatesSRE
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"strings"
7+
8+
configv1 "github.com/openshift/api/config/v1"
9+
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
10+
k8sclient "github.com/openshift/configuration-anomaly-detection/pkg/k8s"
11+
"github.com/openshift/configuration-anomaly-detection/pkg/logging"
12+
"github.com/openshift/configuration-anomaly-detection/pkg/networkverifier"
13+
"github.com/openshift/configuration-anomaly-detection/pkg/notewriter"
14+
"k8s.io/apimachinery/pkg/fields"
15+
"sigs.k8s.io/controller-runtime/pkg/client"
16+
)
17+
18+
// Identifiers used by the CannotRetrieveUpdatesSRE investigation.
const (
19+
// alertname is returned by Name and matched against alert titles in ShouldInvestigateAlert.
alertname = "CannotRetrieveUpdatesSRE"
20+
// remediationName is the name passed to k8sclient.New and k8sclient.Cleanup.
remediationName = "CannotRetrieveUpdatesSRE"
21+
)
22+
23+
// Investigation implements the CannotRetrieveUpdatesSRE investigation.
// Both fields are populated by setup at the start of Run.
type Investigation struct {
24+
// kclient is the Kubernetes client used to list the cluster's ClusterVersion.
kclient client.Client
25+
// notes collects the findings that are sent along with the PagerDuty escalation.
notes *notewriter.NoteWriter
26+
}
27+
28+
// setup initializes the investigation resources
29+
func (i *Investigation) setup(r *investigation.Resources) error {
30+
logging.Infof("Setting up investigation '%s' for cluster %s with remediation name %s",
31+
i.Name(), r.Cluster.ID(), r.Name)
32+
33+
k, err := k8sclient.New(r.Cluster.ID(), r.OcmClient, remediationName)
34+
if err != nil {
35+
logging.Errorf("Failed to initialize Kubernetes client: %v", err)
36+
return fmt.Errorf("failed to initialize kubernetes client: %w", err)
37+
}
38+
i.kclient = k
39+
i.notes = notewriter.New(r.Name, logging.RawLogger)
40+
41+
logging.Infof("Successfully set up Kubernetes client and notewriter for remediation %s", r.Name)
42+
return nil
43+
}
44+
45+
// cleanup handles resource cleanup after investigation
46+
func (i *Investigation) cleanup(r *investigation.Resources) error {
47+
logging.Infof("Cleaning up investigation resources for cluster %s", r.Cluster.ID())
48+
err := k8sclient.Cleanup(r.Cluster.ID(), r.OcmClient, remediationName)
49+
if err != nil {
50+
logging.Errorf("Failed to cleanup Kubernetes client: %v", err)
51+
return fmt.Errorf("failed to cleanup kubernetes client: %w", err)
52+
}
53+
logging.Infof("Cleanup completed successfully for cluster %s", r.Cluster.ID())
54+
return nil
55+
}
56+
57+
// Run executes the investigation for the CannotRetrieveUpdatesSRE alert
58+
func (i *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
59+
result := investigation.InvestigationResult{}
60+
61+
// Setup & teardown
62+
err := i.setup(r)
63+
if err != nil {
64+
return result, fmt.Errorf("failed to setup investigation: %w", err)
65+
}
66+
defer func(r *investigation.Resources) {
67+
if err := i.cleanup(r); err != nil {
68+
logging.Errorf("Failed to cleanup investigation: %v", err)
69+
}
70+
}(r)
71+
72+
if r.Cluster == nil || r.Cluster.ID() == "" {
73+
errMsg := "Invalid cluster configuration: cluster or cluster ID is missing"
74+
logging.Errorf(errMsg)
75+
i.notes.AppendWarning(errMsg)
76+
return result, r.PdClient.EscalateIncidentWithNote(i.notes.String())
77+
}
78+
79+
// Run network verification
80+
logging.Infof("Running network verification for cluster %s", r.Cluster.ID())
81+
verifierResult, failureReason, err := networkverifier.Run(r.Cluster, r.ClusterDeployment, r.AwsClient)
82+
if err != nil {
83+
logging.Errorf("Network verifier failed: %v", err)
84+
i.notes.AppendWarning("Network verifier encountered an error: %v", err)
85+
} else {
86+
logging.Infof("Network verification completed with result: %v", verifierResult)
87+
switch verifierResult {
88+
case networkverifier.Success:
89+
i.notes.AppendSuccess("Network verifier passed")
90+
case networkverifier.Failure:
91+
logging.Infof("Network verifier reported failure: %s", failureReason)
92+
result.ServiceLogPrepared = investigation.InvestigationStep{Performed: true, Labels: nil}
93+
i.notes.AppendWarning("NetworkVerifier found unreachable targets. \n \n Verify and send service log if necessary: \n osdctl servicelog post %s -t https://raw.githubusercontent.com/openshift/managed-notifications/master/osd/required_network_egresses_are_blocked.json -p URLS=%s",
94+
r.Cluster.ID(), failureReason)
95+
}
96+
}
97+
98+
// Check ClusterVersion
99+
logging.Infof("Checking ClusterVersion for cluster %s", r.Cluster.ID())
100+
cvList := &configv1.ClusterVersionList{}
101+
listOptions := &client.ListOptions{FieldSelector: fields.SelectorFromSet(fields.Set{"metadata.name": "version"})}
102+
err = i.kclient.List(context.TODO(), cvList, listOptions)
103+
switch {
104+
case err != nil:
105+
logging.Errorf("Failed to list ClusterVersion: %v", err)
106+
i.notes.AppendWarning("Failed to list ClusterVersion: %v\nThis may indicate cluster access issues", err)
107+
case len(cvList.Items) != 1:
108+
logging.Warnf("Found %d ClusterVersions, expected 1", len(cvList.Items))
109+
i.notes.AppendWarning("Found %d ClusterVersions, expected 1", len(cvList.Items))
110+
default:
111+
versionCv := cvList.Items[0]
112+
logging.Infof("ClusterVersion found: %s", versionCv.Status.Desired.Version)
113+
for _, condition := range versionCv.Status.Conditions {
114+
logging.Debugf("Checking ClusterVersion condition: Type=%s, Status=%s, Reason=%s, Message=%s",
115+
condition.Type, condition.Status, condition.Reason, condition.Message)
116+
if condition.Type == "RetrievedUpdates" &&
117+
condition.Status == "False" &&
118+
condition.Reason == "VersionNotFound" &&
119+
strings.Contains(condition.Message, "Unable to retrieve available updates") {
120+
i.notes.AppendWarning("ClusterVersion error detected: %s\nThis indicates the current version %s is not found in the specified channel %s",
121+
condition.Message, versionCv.Status.Desired.Version, versionCv.Spec.Channel)
122+
}
123+
}
124+
fmt.Printf("Cluster version: %s\n", versionCv.Status.Desired.Version)
125+
}
126+
127+
i.notes.AppendWarning("Alert escalated to on-call primary for review.")
128+
logging.Infof("Escalating incident with notes for cluster %s", r.Cluster.ID())
129+
err = r.PdClient.EscalateIncidentWithNote(i.notes.String())
130+
if err != nil {
131+
logging.Errorf("Failed to escalate incident to PagerDuty: %v", err)
132+
return result, fmt.Errorf("failed to escalate incident: %w", err)
133+
}
134+
logging.Infof("Investigation completed and escalated successfully for cluster %s", r.Cluster.ID())
135+
136+
return result, nil
137+
}
138+
139+
func (i *Investigation) Name() string {
140+
return alertname
141+
}
142+
143+
func (i *Investigation) Description() string {
144+
return fmt.Sprintf("Investigates '%s' alerts by running network verifier and checking ClusterVersion", alertname)
145+
}
146+
147+
func (i *Investigation) ShouldInvestigateAlert(alert string) bool {
148+
return strings.Contains(alert, alertname)
149+
}
150+
151+
func (i *Investigation) IsExperimental() bool {
152+
return true
153+
}
Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
package CannotRetrieveUpdatesSRE
2+
3+
import (
4+
"context"
5+
"errors"
6+
7+
. "github.com/onsi/ginkgo/v2"
8+
. "github.com/onsi/gomega"
9+
cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1"
10+
configv1 "github.com/openshift/api/config/v1"
11+
awsmock "github.com/openshift/configuration-anomaly-detection/pkg/aws/mock"
12+
investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
13+
"github.com/openshift/configuration-anomaly-detection/pkg/logging"
14+
pdmock "github.com/openshift/configuration-anomaly-detection/pkg/pagerduty/mock"
15+
hivev1 "github.com/openshift/hive/apis/hive/v1"
16+
"go.uber.org/mock/gomock"
17+
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
18+
"k8s.io/apimachinery/pkg/runtime"
19+
"sigs.k8s.io/controller-runtime/pkg/client"
20+
"sigs.k8s.io/controller-runtime/pkg/client/fake"
21+
"sigs.k8s.io/controller-runtime/pkg/client/interceptor"
22+
)
23+
24+
var _ = Describe("CannotRetrieveUpdatesSRE Investigation", func() {
25+
var (
26+
mockCtrl *gomock.Controller
27+
clusterBuilder *cmv1.ClusterBuilder
28+
cluster *cmv1.Cluster
29+
clusterDeployment *hivev1.ClusterDeployment
30+
pdClient *pdmock.MockClient
31+
awsCli *awsmock.MockClient
32+
fakeClient client.Client
33+
scheme *runtime.Scheme
34+
inv *Investigation
35+
resources *investigation.Resources
36+
)
37+
38+
BeforeEach(func() {
39+
logging.InitLogger("fatal", "")
40+
41+
mockCtrl = gomock.NewController(GinkgoT())
42+
pdClient = pdmock.NewMockClient(mockCtrl)
43+
awsCli = awsmock.NewMockClient(mockCtrl)
44+
45+
// Setup cluster
46+
clusterBuilder = cmv1.NewCluster().ID("test-cluster")
47+
var err error
48+
cluster, err = clusterBuilder.Build()
49+
Expect(err).ToNot(HaveOccurred())
50+
51+
// Setup cluster deployment
52+
clusterDeployment = &hivev1.ClusterDeployment{
53+
Spec: hivev1.ClusterDeploymentSpec{
54+
ClusterMetadata: &hivev1.ClusterMetadata{
55+
InfraID: "infra_id",
56+
},
57+
},
58+
}
59+
60+
// Setup fake Kubernetes client
61+
scheme = runtime.NewScheme()
62+
Expect(configv1.AddToScheme(scheme)).To(Succeed())
63+
fakeClient = fake.NewClientBuilder().WithScheme(scheme).Build()
64+
65+
inv = &Investigation{
66+
kclient: fakeClient,
67+
}
68+
resources = &investigation.Resources{
69+
Cluster: cluster,
70+
ClusterDeployment: clusterDeployment,
71+
PdClient: pdClient,
72+
AwsClient: awsCli,
73+
Name: remediationName,
74+
}
75+
})
76+
77+
AfterEach(func() {
78+
mockCtrl.Finish()
79+
})
80+
81+
Describe("Run Method", func() {
82+
When("ClusterVersion has VersionNotFound condition", func() {
83+
It("Should detect the condition and escalate with appropriate notes", func() {
84+
// Setup ClusterVersion with VersionNotFound
85+
cv := &configv1.ClusterVersion{
86+
ObjectMeta: v1.ObjectMeta{Name: "version"},
87+
Spec: configv1.ClusterVersionSpec{Channel: "stable-4.18"},
88+
Status: configv1.ClusterVersionStatus{
89+
Desired: configv1.Release{Version: "4.18.5"},
90+
Conditions: []configv1.ClusterOperatorStatusCondition{
91+
{
92+
Type: "RetrievedUpdates",
93+
Status: "False",
94+
Reason: "VersionNotFound",
95+
Message: "Unable to retrieve available updates: version 4.18.5 not found",
96+
},
97+
},
98+
},
99+
}
100+
fakeClient = fake.NewClientBuilder().WithScheme(scheme).WithObjects(cv).Build()
101+
inv.kclient = fakeClient
102+
103+
// Arrange
104+
awsCli.EXPECT().GetSecurityGroupID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return("sg-123", nil)
105+
awsCli.EXPECT().GetSubnetID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return([]string{"subnet-1"}, nil)
106+
pdClient.EXPECT().EscalateIncidentWithNote(gomock.Any()).DoAndReturn(func(note string) error {
107+
Expect(note).To(ContainSubstring("Network verifier passed"))
108+
Expect(note).To(ContainSubstring("ClusterVersion error detected: Unable to retrieve available updates: version 4.18.5 not found"))
109+
Expect(note).To(ContainSubstring("This indicates the current version 4.18.5 is not found in the specified channel stable-4.18"))
110+
Expect(note).To(ContainSubstring("Alert escalated to on-call primary for review"))
111+
return nil
112+
})
113+
114+
// Act
115+
result, err := inv.Run(resources)
116+
117+
// Assert
118+
Expect(err).ToNot(HaveOccurred())
119+
Expect(result.ServiceLogPrepared.Performed).To(BeFalse())
120+
})
121+
})
122+
123+
When("Network verifier fails", func() {
124+
It("Should prepare a service log and escalate", func() {
125+
// Arrange
126+
awsCli.EXPECT().GetSecurityGroupID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return("sg-123", nil)
127+
awsCli.EXPECT().GetSubnetID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return([]string{"subnet-1"}, nil)
128+
pdClient.EXPECT().EscalateIncidentWithNote(gomock.Any()).DoAndReturn(func(note string) error {
129+
Expect(note).To(ContainSubstring("NetworkVerifier found unreachable targets"))
130+
Expect(note).To(ContainSubstring("osdctl servicelog post test-cluster"))
131+
Expect(note).To(ContainSubstring("Alert escalated to on-call primary for review"))
132+
return nil
133+
})
134+
135+
// Act
136+
result, err := inv.Run(resources)
137+
138+
// Assert
139+
Expect(err).ToNot(HaveOccurred())
140+
Expect(result.ServiceLogPrepared.Performed).To(BeTrue())
141+
})
142+
})
143+
144+
When("Kubernetes client fails to list ClusterVersion", func() {
145+
It("Should escalate with a warning note", func() {
146+
// Setup failing Kubernetes client with interceptor
147+
fakeClient = fake.NewClientBuilder().WithScheme(scheme).WithInterceptorFuncs(interceptor.Funcs{
148+
List: func(ctx context.Context, client client.WithWatch, list client.ObjectList, opts ...client.ListOption) error {
149+
return errors.New("mock list error")
150+
},
151+
}).Build()
152+
inv.kclient = fakeClient
153+
154+
// Arrange
155+
awsCli.EXPECT().GetSecurityGroupID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return("sg-123", nil)
156+
awsCli.EXPECT().GetSubnetID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return([]string{"subnet-1"}, nil)
157+
pdClient.EXPECT().EscalateIncidentWithNote(gomock.Any()).DoAndReturn(func(note string) error {
158+
Expect(note).To(ContainSubstring("Network verifier passed"))
159+
Expect(note).To(ContainSubstring("Failed to list ClusterVersion: mock list error"))
160+
Expect(note).To(ContainSubstring("This may indicate cluster access issues"))
161+
Expect(note).To(ContainSubstring("Alert escalated to on-call primary for review"))
162+
return nil
163+
})
164+
165+
// Act
166+
_, err := inv.Run(resources)
167+
168+
// Assert
169+
Expect(err).ToNot(HaveOccurred())
170+
})
171+
})
172+
})
173+
})
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# CannotRetrieveUpdatesSRE Investigation
2+
3+
Investigates the `CannotRetrieveUpdatesSRE` alert by running the network verifier and reporting relevant `ClusterVersion` error conditions in the incident notes.
4+
5+
## Investigation Logic
6+
7+
The `CannotRetrieveUpdatesSRE` investigation is designed to diagnose issues where an OpenShift cluster cannot retrieve updates from its configured channel. It performs two main checks:
8+
1. **Network Verification**: Uses the `networkverifier` package to ensure the cluster can reach required update endpoints.
9+
2. **ClusterVersion Check**: Examines the `ClusterVersion` resource for conditions indicating update retrieval failures, such as `VersionNotFound`.
10+
11+
## Testing
12+
13+
Refer to the [testing README](./testing/README.md) for instructions on testing this investigation.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Testing CannotRetrieveUpdatesSRE Investigation
2+
3+
TODO:
4+
- Add a test script or test objects to this directory for future maintainers to use
5+
- Edit this README file and add detailed instructions on how to use the script/objects to recreate the conditions for the investigation. Be sure to include any assumptions or prerequisites about the environment (disable hive syncsetting, etc.).

pkg/investigations/registry.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package investigations
22

33
import (
4+
CannotRetrieveUpdatesSRE "github.com/openshift/configuration-anomaly-detection/pkg/investigations/CannotRetrieveUpdatesSRE"
45
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/ccam"
56
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/chgm"
67
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/clustermonitoringerrorbudgetburn"
@@ -16,6 +17,7 @@ var availableInvestigations = []investigation.Investigation{
1617
&clustermonitoringerrorbudgetburn.Investigation{},
1718
&cpd.Investigation{},
1819
&insightsoperatordown.Investigation{},
20+
&CannotRetrieveUpdatesSRE.Investigation{},
1921
}
2022

2123
// GetInvestigation returns the first Investigation that applies to the given alert title.

test/generate_incident.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#!/bin/bash
1+
#!/bin/zsh
22
set -e
33

44
# Define the mapping of alert names to titles
@@ -8,6 +8,7 @@ declare -A alert_mapping=(
88
["ClusterProvisioningDelay"]="ClusterProvisioningDelay -"
99
["ClusterMonitoringErrorBudgetBurnSRE"]="ClusterMonitoringErrorBudgetBurnSRE Critical (1)"
1010
["InsightsOperatorDown"]="InsightsOperatorDown"
11+
["CannotRetrieveUpdatesSRE"]="CannotRetrieveUpdatesSRE"
1112
)
1213

1314
# Function to print help message

0 commit comments

Comments
 (0)