Skip to content

Commit 685883f

Browse files
committed
Initial implementation for CannotRetrieveUpdatesSRE
1 parent 6ab773b commit 685883f

File tree

7 files changed

+296
-0
lines changed

7 files changed

+296
-0
lines changed
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# cannotretrieveupdatessre Investigation
2+
3+
Investigates the CannotRetrieveUpdatesSRE alert by running the network verifier and checking the cluster's ClusterVersion status, then escalates the PagerDuty alert with the investigation details attached as a note.
4+
5+
## Investigation Logic
6+
7+
The `CannotRetrieveUpdatesSRE` investigation is designed to diagnose issues where an OpenShift cluster cannot retrieve updates from its configured channel. It performs two main checks:
8+
1. **Network Verification**: Uses the `networkverifier` package to ensure the cluster can reach required update endpoints.
9+
2. **ClusterVersion Check**: Examines the `RetrievedUpdates` condition of the `ClusterVersion` resource for update retrieval failures, reported with reasons such as `VersionNotFound` or `RemoteFailed`.
10+
11+
## Testing
12+
13+
Refer to the [testing README](./testing/README.md) for instructions on testing this investigation.
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
package cannotretrieveupdatessre
2+
3+
import (
4+
"context"
5+
"errors"
6+
"fmt"
7+
"strings"
8+
9+
configv1 "github.com/openshift/api/config/v1"
10+
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
11+
k8sclient "github.com/openshift/configuration-anomaly-detection/pkg/k8s"
12+
"github.com/openshift/configuration-anomaly-detection/pkg/logging"
13+
"github.com/openshift/configuration-anomaly-detection/pkg/networkverifier"
14+
"github.com/openshift/configuration-anomaly-detection/pkg/notewriter"
15+
"sigs.k8s.io/controller-runtime/pkg/client"
16+
)
17+
18+
// Names used by the CannotRetrieveUpdatesSRE investigation.
const (
19+
// alertname is returned by Name and matched against alert titles in ShouldInvestigateAlert.
alertname = "CannotRetrieveUpdatesSRE"
20+
// remediationName is passed to k8sclient.New and k8sclient.Cleanup in Run.
remediationName = "CannotRetrieveUpdatesSRE"
21+
)
22+
23+
// Investigation is the CannotRetrieveUpdatesSRE investigation. It holds no
// state; all behavior lives in its methods (Run, Name, Description,
// ShouldInvestigateAlert and IsExperimental).
type Investigation struct{}
24+
25+
// Run executes the investigation for the CannotRetrieveUpdatesSRE alert
26+
func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
27+
result := investigation.InvestigationResult{}
28+
notes := notewriter.New("CannotRetrieveUpdatesSRE", logging.RawLogger)
29+
k8scli, err := k8sclient.New(r.Cluster.ID(), r.OcmClient, remediationName)
30+
if err != nil {
31+
return result, fmt.Errorf("unable to initialize k8s cli: %w", err)
32+
}
33+
defer func() {
34+
logging.Infof("Cleaning up investigation resources for cluster %s", r.Cluster.ID())
35+
deferErr := k8sclient.Cleanup(r.Cluster.ID(), r.OcmClient, remediationName)
36+
if deferErr != nil {
37+
logging.Error(deferErr)
38+
err = errors.Join(err, deferErr)
39+
} else {
40+
logging.Infof("Cleanup completed successfully for cluster %s", r.Cluster.ID())
41+
}
42+
}()
43+
44+
// Run network verifier
45+
verifierResult, failureReason, err := networkverifier.Run(r.Cluster, r.ClusterDeployment, r.AwsClient)
46+
if err != nil {
47+
notes.AppendWarning("NetworkVerifier failed to run:\n\t %s", err.Error())
48+
} else {
49+
switch verifierResult {
50+
case networkverifier.Failure:
51+
result.ServiceLogPrepared = investigation.InvestigationStep{Performed: true, Labels: nil}
52+
notes.AppendWarning("NetworkVerifier found unreachable targets. \n \n Verify and send service log if necessary: \n osdctl servicelog post %s -t https://raw.githubusercontent.com/openshift/managed-notifications/master/osd/required_network_egresses_are_blocked.json -p URLS=%s", r.Cluster.ID(), failureReason)
53+
case networkverifier.Success:
54+
notes.AppendSuccess("Network verifier passed")
55+
}
56+
}
57+
58+
// Check ClusterVersion
59+
clusterVersion, note, err := checkClusterVersion(k8scli, r.Cluster.ID())
60+
if err != nil {
61+
notes.AppendWarning("Failure checking ClusterVersion: %s", err.Error())
62+
if note != "" {
63+
notes.AppendWarning(note)
64+
}
65+
} else {
66+
if note != "" {
67+
notes.AppendWarning(note)
68+
}
69+
if clusterVersion != "" {
70+
notes.AppendSuccess("ClusterVersion found: %s", clusterVersion)
71+
}
72+
}
73+
74+
notes.AppendWarning("Alert escalated to on-call primary for review.")
75+
logging.Infof("Escalating incident with notes for cluster %s", r.Cluster.ID())
76+
err = r.PdClient.EscalateIncidentWithNote(notes.String())
77+
if err != nil {
78+
logging.Errorf("Failed to escalate incident to PagerDuty: %v", err)
79+
return result, fmt.Errorf("failed to escalate incident: %w", err)
80+
}
81+
logging.Infof("Investigation completed and escalated successfully for cluster %s", r.Cluster.ID())
82+
83+
return result, nil
84+
}
85+
86+
// checkClusterVersion retrieves the cluster version
87+
func checkClusterVersion(k8scli client.Client, clusterID string) (version string, note string, err error) {
88+
logging.Infof("Checking ClusterVersion for cluster %s", clusterID)
89+
clusterVersion := &configv1.ClusterVersion{}
90+
err = k8scli.Get(context.TODO(), client.ObjectKey{Name: "version"}, clusterVersion)
91+
if err != nil {
92+
return "", "Failed to get ClusterVersion: cluster access issues detected", fmt.Errorf("failed to get ClusterVersion: %w", err)
93+
}
94+
logging.Infof("ClusterVersion channel: %s", clusterVersion.Spec.Channel)
95+
logging.Infof("ClusterVersion found: %s", clusterVersion.Status.Desired.Version)
96+
97+
for _, condition := range clusterVersion.Status.Conditions {
98+
if condition.Type == "RetrievedUpdates" {
99+
note, err = checkRetrievedUpdatesCondition(condition, clusterVersion.Status.Desired.Version, clusterVersion.Spec.Channel)
100+
if err != nil {
101+
return "", note, err
102+
}
103+
return clusterVersion.Status.Desired.Version, note, nil
104+
}
105+
}
106+
return clusterVersion.Status.Desired.Version, "", nil
107+
}
108+
109+
func checkRetrievedUpdatesCondition(condition configv1.ClusterOperatorStatusCondition, currentVersion, channel string) (string, error) {
110+
if condition.Status == configv1.ConditionFalse {
111+
if (condition.Reason == "VersionNotFound" || condition.Reason == "RemoteFailed") &&
112+
strings.Contains(strings.TrimSpace(condition.Message), "Unable to retrieve available updates") {
113+
logging.Warnf("Detected ClusterVersion issue: Reason=%s, Message=%s", condition.Reason, condition.Message)
114+
note := fmt.Sprintf("ClusterVersion issue detected: %s. Current version %s not found in channel %s",
115+
condition.Message, currentVersion, channel)
116+
return note, fmt.Errorf("clusterversion has undesirable state: %s", condition.Reason)
117+
}
118+
}
119+
return "", nil
120+
}
121+
122+
// Name returns the name of the investigation, which is the alertname constant.
func (i *Investigation) Name() string {
123+
return alertname
124+
}
125+
126+
// Description returns a one-sentence summary of what the investigation does,
// with the alert name substituted into it.
func (i *Investigation) Description() string {
127+
return fmt.Sprintf("Investigates '%s' alerts by running network verifier and checking ClusterVersion", alertname)
128+
}
129+
130+
// ShouldInvestigateAlert reports whether alert contains the alert name as a
// substring. The match is case-sensitive.
func (i *Investigation) ShouldInvestigateAlert(alert string) bool {
131+
return strings.Contains(alert, alertname)
132+
}
133+
134+
// IsExperimental always returns true for this investigation.
// NOTE(review): how callers treat experimental investigations is not visible
// here — confirm against the investigation.Investigation interface.
func (i *Investigation) IsExperimental() bool {
135+
return true
136+
}
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
package cannotretrieveupdatessre
2+
3+
import (
4+
"strings"
5+
"testing"
6+
7+
configv1 "github.com/openshift/api/config/v1"
8+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
9+
"k8s.io/client-go/kubernetes/scheme"
10+
"sigs.k8s.io/controller-runtime/pkg/client"
11+
"sigs.k8s.io/controller-runtime/pkg/client/fake"
12+
)
13+
14+
func newFakeClient(objs ...client.Object) (client.Client, error) {
15+
s := scheme.Scheme
16+
err := configv1.AddToScheme(s)
17+
if err != nil {
18+
return nil, err
19+
}
20+
21+
client := fake.NewClientBuilder().WithScheme(s).WithObjects(objs...).Build()
22+
return client, nil
23+
}
24+
25+
func TestCheckClusterVersion(t *testing.T) {
26+
tests := []struct {
27+
name string
28+
clusterVersion *configv1.ClusterVersion
29+
expectedVersion string
30+
expectError bool
31+
expectedNote string
32+
}{
33+
{
34+
name: "RemoteFailed condition",
35+
clusterVersion: &configv1.ClusterVersion{
36+
ObjectMeta: metav1.ObjectMeta{
37+
Name: "version",
38+
},
39+
Spec: configv1.ClusterVersionSpec{
40+
Channel: "stable-4.18-test",
41+
ClusterID: "d1ba89f3-fd3e-48d2-91c6-test",
42+
},
43+
Status: configv1.ClusterVersionStatus{
44+
Desired: configv1.Release{Version: "4.18.10"},
45+
Conditions: []configv1.ClusterOperatorStatusCondition{
46+
{
47+
Type: "RetrievedUpdates",
48+
Status: "False",
49+
Reason: "RemoteFailed",
50+
Message: "Unable to retrieve available updates",
51+
},
52+
},
53+
},
54+
},
55+
expectedVersion: "",
56+
expectError: true,
57+
expectedNote: "ClusterVersion issue detected: Unable to retrieve available updates",
58+
},
59+
{
60+
name: "VersionNotFound condition",
61+
clusterVersion: &configv1.ClusterVersion{
62+
ObjectMeta: metav1.ObjectMeta{
63+
Name: "version",
64+
},
65+
Spec: configv1.ClusterVersionSpec{
66+
Channel: "stable-4.18-test",
67+
ClusterID: "d1ba89f3-fd3e-48d2-91c6-test",
68+
},
69+
Status: configv1.ClusterVersionStatus{
70+
Desired: configv1.Release{Version: "4.18.10"},
71+
Conditions: []configv1.ClusterOperatorStatusCondition{
72+
{
73+
Type: "RetrievedUpdates",
74+
Status: "False",
75+
Reason: "VersionNotFound",
76+
Message: "Unable to retrieve available updates: version 4.18.10 not found in channel stable-4.18-test",
77+
},
78+
},
79+
},
80+
},
81+
expectedVersion: "",
82+
expectError: true,
83+
expectedNote: "ClusterVersion issue detected: Unable to retrieve available updates: version 4.18.10 not found in channel stable-4.18-test.",
84+
},
85+
{
86+
name: "Happy path",
87+
clusterVersion: &configv1.ClusterVersion{
88+
ObjectMeta: metav1.ObjectMeta{
89+
Name: "version",
90+
},
91+
Spec: configv1.ClusterVersionSpec{
92+
Channel: "stable-4.18",
93+
ClusterID: "d1ba89f3-fd3e-48d2-91c6-test",
94+
},
95+
Status: configv1.ClusterVersionStatus{
96+
Desired: configv1.Release{Version: "4.18.10"},
97+
Conditions: []configv1.ClusterOperatorStatusCondition{
98+
{
99+
Type: "RetrievedUpdates",
100+
Status: "True",
101+
Reason: "UpdatesRetrieved",
102+
Message: "Available updates retrieved successfully",
103+
},
104+
},
105+
},
106+
},
107+
expectedVersion: "4.18.10",
108+
expectError: false,
109+
expectedNote: "",
110+
},
111+
}
112+
113+
for _, tt := range tests {
114+
t.Run(tt.name, func(t *testing.T) {
115+
k8scli, err := newFakeClient(tt.clusterVersion)
116+
if err != nil {
117+
t.Fatalf("failed to create a fake client: %v", err)
118+
}
119+
version, note, err := checkClusterVersion(k8scli, "test-cluster")
120+
121+
// Check version
122+
if version != tt.expectedVersion {
123+
t.Errorf("Expected version %q, got %q", tt.expectedVersion, version)
124+
}
125+
126+
// Check note
127+
if !strings.HasPrefix(note, tt.expectedNote) {
128+
t.Errorf("Expected note to start with %q, got %q", tt.expectedNote, note)
129+
}
130+
131+
// Check error
132+
if tt.expectError && err == nil {
133+
t.Errorf("Expected an error, got none")
134+
} else if !tt.expectError && err != nil {
135+
t.Errorf("Expected no error, got %v", err)
136+
}
137+
})
138+
}
139+
}
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Testing CannotRetrieveUpdatesSRE Investigation
2+
3+
TODO:
4+
- Add a test script or test objects to this directory for future maintainers to use
5+
- Edit this README file and add detailed instructions on how to use the script/objects to recreate the conditions for the investigation. Be sure to include any assumptions or prerequisites about the environment (disable hive syncsetting, etc)

pkg/investigations/registry.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package investigations
22

33
import (
44
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/apierrorbudgetburn"
5+
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/cannotretrieveupdatessre"
56
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/ccam"
67
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/chgm"
78
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/clustermonitoringerrorbudgetburn"
@@ -22,6 +23,7 @@ var availableInvestigations = []investigation.Investigation{
2223
&insightsoperatordown.Investigation{},
2324
&upgradeconfigsyncfailureover4hr.Investigation{},
2425
&machinehealthcheckunterminatedshortcircuitsre.Investigation{},
26+
&cannotretrieveupdatessre.Investigation{},
2527
}
2628

2729
// GetInvestigation returns the first Investigation that applies to the given alert title.

test/generate_incident.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ declare -A alert_mapping=(
1010
["InsightsOperatorDown"]="InsightsOperatorDown"
1111
["MachineHealthCheckUnterminatedShortCircuitSRE"]="MachineHealthCheckUnterminatedShortCircuitSRE CRITICAL (1)"
1212
["ApiErrorBudgetBurn"]="api-ErrorBudgetBurn k8sgpt test CRITICAL (1)"
13+
["CannotRetrieveUpdatesSRE"]="CannotRetrieveUpdatesSRE"
1314
)
1415

1516
# Function to print help message

0 commit comments

Comments
 (0)