Skip to content

Commit 9524efe

Browse files
committed
Initial implementation for CannotRetrieveUpdatesSRE
1 parent 53d07dc commit 9524efe

File tree

5 files changed

+169
-1
lines changed

5 files changed

+169
-1
lines changed
Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
package CannotRetrieveUpdatesSRE
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"strings"
7+
8+
configv1 "github.com/openshift/api/config/v1"
9+
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
10+
k8sclient "github.com/openshift/configuration-anomaly-detection/pkg/k8s"
11+
"github.com/openshift/configuration-anomaly-detection/pkg/logging"
12+
"github.com/openshift/configuration-anomaly-detection/pkg/networkverifier"
13+
"github.com/openshift/configuration-anomaly-detection/pkg/notewriter"
14+
"k8s.io/apimachinery/pkg/fields"
15+
"sigs.k8s.io/controller-runtime/pkg/client"
16+
)
17+
18+
// Identifiers used throughout this investigation.
const (
	// alertname is returned by Name and is the substring that
	// ShouldInvestigateAlert looks for in an incoming alert title.
	alertname = "CannotRetrieveUpdatesSRE"

	// remediationName is the name handed to k8sclient.New and
	// k8sclient.Cleanup when the Kubernetes client is created and released.
	remediationName = "CannotRetrieveUpdatesSRE"
)
22+
23+
type Investigation struct {
24+
kclient client.Client
25+
notes *notewriter.NoteWriter
26+
}
27+
28+
// setup initializes the investigation resources
29+
func (i *Investigation) setup(r *investigation.Resources) error {
30+
logging.Infof("Setting up investigation '%s' for cluster %s with remediation name %s",
31+
i.Name(), r.Cluster.ID(), r.Name)
32+
33+
k, err := k8sclient.New(r.Cluster.ID(), r.OcmClient, remediationName)
34+
if err != nil {
35+
logging.Errorf("Failed to initialize Kubernetes client: %v", err)
36+
return fmt.Errorf("failed to initialize kubernetes client: %w", err)
37+
}
38+
i.kclient = k
39+
i.notes = notewriter.New(r.Name, logging.RawLogger)
40+
41+
logging.Infof("Successfully set up Kubernetes client and notewriter for remediation %s", r.Name)
42+
return nil
43+
}
44+
45+
// cleanup handles resource cleanup after investigation
46+
func (i *Investigation) cleanup(r *investigation.Resources) error {
47+
logging.Infof("Cleaning up investigation resources for cluster %s", r.Cluster.ID())
48+
err := k8sclient.Cleanup(r.Cluster.ID(), r.OcmClient, remediationName)
49+
if err != nil {
50+
logging.Errorf("Failed to cleanup Kubernetes client: %v", err)
51+
return fmt.Errorf("failed to cleanup kubernetes client: %w", err)
52+
}
53+
logging.Infof("Cleanup completed successfully for cluster %s", r.Cluster.ID())
54+
return nil
55+
}
56+
57+
// Run executes the investigation for the CannotRetrieveUpdatesSRE alert
58+
func (i *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
59+
result := investigation.InvestigationResult{}
60+
61+
// Setup & teardown
62+
err := i.setup(r)
63+
if err != nil {
64+
return result, fmt.Errorf("failed to setup investigation: %w", err)
65+
}
66+
defer func(r *investigation.Resources) {
67+
if err := i.cleanup(r); err != nil {
68+
logging.Errorf("Failed to cleanup investigation: %v", err)
69+
}
70+
}(r)
71+
72+
if r.Cluster == nil || r.Cluster.ID() == "" {
73+
errMsg := "Invalid cluster configuration: cluster or cluster ID is missing"
74+
logging.Errorf(errMsg)
75+
i.notes.AppendWarning(errMsg)
76+
return result, r.PdClient.EscalateIncidentWithNote(i.notes.String())
77+
}
78+
79+
// Run network verification
80+
logging.Infof("Running network verification for cluster %s", r.Cluster.ID())
81+
verifierResult, failureReason, err := networkverifier.Run(r.Cluster, r.ClusterDeployment, r.AwsClient)
82+
if err != nil {
83+
logging.Errorf("Network verifier failed: %v", err)
84+
i.notes.AppendWarning("Network verifier encountered an error: %v", err)
85+
} else {
86+
logging.Infof("Network verification completed with result: %v", verifierResult)
87+
switch verifierResult {
88+
case networkverifier.Success:
89+
i.notes.AppendSuccess("Network verifier passed")
90+
case networkverifier.Failure:
91+
logging.Infof("Network verifier reported failure: %s", failureReason)
92+
result.ServiceLogPrepared = investigation.InvestigationStep{Performed: true, Labels: nil}
93+
i.notes.AppendWarning("NetworkVerifier found unreachable targets. \n \n Verify and send service log if necessary: \n osdctl servicelog post %s -t https://raw.githubusercontent.com/openshift/managed-notifications/master/osd/required_network_egresses_are_blocked.json -p URLS=%s",
94+
r.Cluster.ID(), failureReason)
95+
}
96+
}
97+
98+
// Check ClusterVersion
99+
logging.Infof("Checking ClusterVersion for cluster %s", r.Cluster.ID())
100+
cvList := &configv1.ClusterVersionList{}
101+
listOptions := &client.ListOptions{FieldSelector: fields.SelectorFromSet(fields.Set{"metadata.name": "version"})}
102+
err = i.kclient.List(context.TODO(), cvList, listOptions)
103+
if err != nil {
104+
logging.Errorf("Failed to list ClusterVersion: %v", err)
105+
i.notes.AppendWarning("Failed to list ClusterVersion: %v\nThis may indicate cluster access issues", err)
106+
} else if len(cvList.Items) != 1 {
107+
logging.Warnf("Found %d ClusterVersions, expected 1", len(cvList.Items))
108+
i.notes.AppendWarning("Found %d ClusterVersions, expected 1", len(cvList.Items))
109+
} else {
110+
versionCv := cvList.Items[0]
111+
logging.Infof("ClusterVersion found: %s", versionCv.Status.Desired.Version)
112+
for _, condition := range versionCv.Status.Conditions {
113+
logging.Debugf("Checking ClusterVersion condition: Type=%s, Status=%s, Reason=%s, Message=%s",
114+
condition.Type, condition.Status, condition.Reason, condition.Message)
115+
if condition.Type == "RetrievedUpdates" &&
116+
condition.Status == "False" &&
117+
condition.Reason == "VersionNotFound" &&
118+
strings.Contains(condition.Message, "Unable to retrieve available updates") {
119+
i.notes.AppendWarning("ClusterVersion error detected: %s\nThis indicates the current version %s is not found in the specified channel %s",
120+
condition.Message, versionCv.Status.Desired.Version, versionCv.Spec.Channel)
121+
}
122+
}
123+
fmt.Printf("Cluster version: %s\n", versionCv.Status.Desired.Version)
124+
}
125+
126+
i.notes.AppendWarning("Alert escalated to on-call primary for review.")
127+
logging.Infof("Escalating incident with notes for cluster %s", r.Cluster.ID())
128+
err = r.PdClient.EscalateIncidentWithNote(i.notes.String())
129+
if err != nil {
130+
logging.Errorf("Failed to escalate incident to PagerDuty: %v", err)
131+
return result, fmt.Errorf("failed to escalate incident: %w", err)
132+
}
133+
logging.Infof("Investigation completed and escalated successfully for cluster %s", r.Cluster.ID())
134+
135+
return result, nil
136+
}
137+
138+
func (i *Investigation) Name() string {
139+
return alertname
140+
}
141+
142+
func (i *Investigation) Description() string {
143+
return fmt.Sprintf("Investigates '%s' alerts by running network verifier and checking ClusterVersion", alertname)
144+
}
145+
146+
func (i *Investigation) ShouldInvestigateAlert(alert string) bool {
147+
return strings.Contains(alert, alertname)
148+
}
149+
150+
func (i *Investigation) IsExperimental() bool {
151+
return true
152+
}
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# CannotRetrieveUpdatesSRE Investigation
2+
3+
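Investigates the CannotRetrieveUpdatesSRE alert by running the network verifier and checking the ClusterVersion `RetrievedUpdates` condition, then escalates the incident with the collected notes.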
4+
5+
## Testing
6+
7+
Refer to the [testing README](./testing/README.md) for instructions on testing this investigation.
8+
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Testing CannotRetrieveUpdatesSRE Investigation
2+
3+
TODO:
4+
- Add a test script or test objects to this directory for future maintainers to use
5+
- Edit this README file and add detailed instructions on how to use the script/objects to recreate the conditions for the investigation. Be sure to include any assumptions or prerequisites about the environment (disable hive syncsetting, etc)

pkg/investigations/registry.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package investigations
22

33
import (
4+
CannotRetrieveUpdatesSRE "github.com/openshift/configuration-anomaly-detection/pkg/investigations/CannotRetrieveUpdatesSRE"
45
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/ccam"
56
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/chgm"
67
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/clustermonitoringerrorbudgetburn"
@@ -16,6 +17,7 @@ var availableInvestigations = []investigation.Investigation{
1617
&clustermonitoringerrorbudgetburn.Investigation{},
1718
&cpd.Investigation{},
1819
&insightsoperatordown.Investigation{},
20+
&CannotRetrieveUpdatesSRE.Investigation{},
1921
}
2022

2123
// GetInvestigation returns the first Investigation that applies to the given alert title.

test/generate_incident.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#!/bin/bash
1+
#!/bin/zsh
22
set -e
33

44
# Define the mapping of alert names to titles
@@ -8,6 +8,7 @@ declare -A alert_mapping=(
88
["ClusterProvisioningDelay"]="ClusterProvisioningDelay -"
99
["ClusterMonitoringErrorBudgetBurnSRE"]="ClusterMonitoringErrorBudgetBurnSRE Critical (1)"
1010
["InsightsOperatorDown"]="InsightsOperatorDown"
11+
["CannotRetrieveUpdatesSRE"]="CannotRetrieveUpdatesSRE"
1112
)
1213

1314
# Function to print help message

0 commit comments

Comments
 (0)