Skip to content

Commit 53d07dc

Browse files
Merge pull request #380 from RaphaelBut/add-insights-inv
Add insights operator down investigation
2 parents 61b96ec + f6e58ba commit 53d07dc

File tree

7 files changed

+246
-0
lines changed

7 files changed

+246
-0
lines changed
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
package insightsoperatordown
2+
3+
import (
4+
"context"
5+
"errors"
6+
"fmt"
7+
"strings"
8+
9+
configv1 "github.com/openshift/api/config/v1"
10+
investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
11+
k8sclient "github.com/openshift/configuration-anomaly-detection/pkg/k8s"
12+
"github.com/openshift/configuration-anomaly-detection/pkg/logging"
13+
"github.com/openshift/configuration-anomaly-detection/pkg/networkverifier"
14+
"github.com/openshift/configuration-anomaly-detection/pkg/notewriter"
15+
"github.com/openshift/configuration-anomaly-detection/pkg/ocm"
16+
"k8s.io/apimachinery/pkg/fields"
17+
"sigs.k8s.io/controller-runtime/pkg/client"
18+
)
19+
20+
// Investigation is the handler for the InsightsOperatorDown alert. It carries
// no state; all inputs are passed to its methods.
type Investigation struct{}
21+
22+
func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
23+
result := investigation.InvestigationResult{}
24+
notes := notewriter.New(r.Name, logging.RawLogger)
25+
26+
user, err := ocm.GetCreatorFromCluster(r.OcmClient.GetConnection(), r.Cluster)
27+
if err != nil {
28+
notes.AppendWarning("encountered an issue when checking if the cluster owner is banned: %s", err)
29+
return result, r.PdClient.EscalateIncidentWithNote(notes.String())
30+
}
31+
32+
if user.Banned() {
33+
notes.AppendWarning("User is banned: %s\nBan description: %s\nPlease open a proactive case, so that MCS can resolve the ban or organize a ownership transfer.", user.BanCode(), user.BanDescription())
34+
} else {
35+
notes.AppendSuccess("User is not banned.")
36+
}
37+
38+
// We continue with the next step OCPBUG22226 even if the user is banned.
39+
k8scli, err := k8sclient.New(r.Cluster.ID(), r.OcmClient, r.Name)
40+
if err != nil {
41+
return result, fmt.Errorf("unable to initialize k8s cli: %w", err)
42+
}
43+
defer func() {
44+
deferErr := k8sclient.Cleanup(r.Cluster.ID(), r.OcmClient, r.Name)
45+
if deferErr != nil {
46+
logging.Error(deferErr)
47+
err = errors.Join(err, deferErr)
48+
}
49+
}()
50+
51+
coList := &configv1.ClusterOperatorList{}
52+
listOptions := &client.ListOptions{FieldSelector: fields.SelectorFromSet(fields.Set{"metadata.name": "insights"})}
53+
err = k8scli.List(context.TODO(), coList, listOptions)
54+
if err != nil {
55+
return result, fmt.Errorf("unable to list insights clusteroperator: %w", err)
56+
}
57+
58+
if len(coList.Items) != 1 {
59+
return result, fmt.Errorf("found %d clusteroperators, expected 1", len(coList.Items))
60+
}
61+
co := coList.Items[0]
62+
63+
if isOCPBUG22226(&co) {
64+
notes.AppendWarning("Found symptom of OCPBUGS-22226. Try deleting the insights operator pod to remediate.\n$ oc -n openshift-insights delete pods -l app=insights-operator --wait=false")
65+
return result, r.PdClient.EscalateIncidentWithNote(notes.String())
66+
} else {
67+
notes.AppendSuccess("Ruled out OCPBUGS-22226")
68+
}
69+
70+
verifierResult, failureReason, err := networkverifier.Run(r.Cluster, r.ClusterDeployment, r.AwsClient)
71+
if err != nil {
72+
logging.Error("Network verifier ran into an error: %s", err.Error())
73+
notes.AppendWarning("NetworkVerifier failed to run:\n\t %s", err.Error())
74+
75+
err = r.PdClient.AddNote(notes.String())
76+
if err != nil {
77+
// We do not return as we want the alert to be escalated either no matter what.
78+
logging.Error("could not add failure reason incident notes")
79+
}
80+
}
81+
82+
switch verifierResult {
83+
case networkverifier.Failure:
84+
logging.Infof("Network verifier reported failure: %s", failureReason)
85+
// XXX: metrics.Inc(metrics.ServicelogPrepared, investigationName)
86+
result.ServiceLogPrepared = investigation.InvestigationStep{Performed: true, Labels: nil}
87+
notes.AppendWarning("NetworkVerifier found unreachable targets. \n \n Verify and send service log if necessary: \n osdctl servicelog post %s -t https://raw.githubusercontent.com/openshift/managed-notifications/master/osd/required_network_egresses_are_blocked.json -p URLS=%s", r.Cluster.ID(), failureReason)
88+
89+
// In the future, we want to send a service log in this case
90+
err = r.PdClient.AddNote(notes.String())
91+
if err != nil {
92+
logging.Error("could not add issues to incident notes")
93+
}
94+
case networkverifier.Success:
95+
notes.AppendSuccess("Network verifier passed")
96+
err = r.PdClient.AddNote(notes.String())
97+
if err != nil {
98+
logging.Error("could not add passed message to incident notes")
99+
}
100+
}
101+
return result, r.PdClient.EscalateIncidentWithNote(notes.String())
102+
}
103+
104+
func isOCPBUG22226(co *configv1.ClusterOperator) bool {
105+
symptomStatusString := "Failed to pull SCA certs"
106+
107+
for _, condition := range co.Status.Conditions {
108+
if condition.Type == "SCAAvailable" && strings.Contains(condition.Message, symptomStatusString) {
109+
return true
110+
}
111+
}
112+
return false
113+
}
114+
115+
func (c *Investigation) Name() string {
116+
return "insightsoperatordown"
117+
}
118+
119+
func (c *Investigation) Description() string {
120+
return "Investigate insights operator down alert"
121+
}
122+
123+
func (c *Investigation) ShouldInvestigateAlert(alert string) bool {
124+
return strings.Contains(alert, "InsightsOperatorDown")
125+
}
126+
127+
func (c *Investigation) IsExperimental() bool {
128+
return false
129+
}
130+
131+
func (c *Investigation) RequiresAwsClient() bool {
132+
return false
133+
}
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
package insightsoperatordown
2+
3+
import (
4+
"testing"
5+
6+
configv1 "github.com/openshift/api/config/v1"
7+
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
8+
)
9+
10+
func TestIsOCPBUG22226(t *testing.T) {
11+
tests := []struct {
12+
name string
13+
co configv1.ClusterOperator
14+
expected bool
15+
}{
16+
{
17+
name: "SCA certs pull failure detected",
18+
co: configv1.ClusterOperator{
19+
ObjectMeta: v1.ObjectMeta{Name: "insights"},
20+
Status: configv1.ClusterOperatorStatus{
21+
Conditions: []configv1.ClusterOperatorStatusCondition{
22+
{Type: "SCAAvailable", Message: "Failed to pull SCA certs"},
23+
},
24+
},
25+
},
26+
expected: true,
27+
},
28+
{
29+
name: "No SCA certs pull failure",
30+
co: configv1.ClusterOperator{
31+
ObjectMeta: v1.ObjectMeta{Name: "insights"},
32+
Status: configv1.ClusterOperatorStatus{
33+
Conditions: []configv1.ClusterOperatorStatusCondition{
34+
{Type: "SCAAvailable", Message: "All systems operational"},
35+
},
36+
},
37+
},
38+
expected: false,
39+
},
40+
}
41+
42+
for _, tt := range tests {
43+
t.Run(tt.name, func(t *testing.T) {
44+
if isOCPBUG22226(&tt.co) != tt.expected {
45+
t.Fatalf("expected %v, got %v", tt.expected, !tt.expected)
46+
}
47+
})
48+
}
49+
}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Testing InsightsOperatorDownSRE
2+
3+
# OCPBUGS-22226
4+
5+
We can induce the symptom of `Failed to pull SCA certs` on a stage cluster by blocking `https://api.stage.openshift.com`.
6+
The provided script creates a Rule Group and associates it with your cluster's VPC.
7+
Requires `awscli`, `jq`, and `ocm` with the backplane plugin.
8+
9+
```
10+
./pkg/investigations/insightsoperatordown/testing/block-api-openshift.sh <cluster-id>
11+
```
12+
13+
# Banned user
14+
15+
TODO
16+
17+
# Additional Resources
18+
19+
- SOP Link https://github.com/openshift/ops-sop/blob/master/v4/troubleshoot/clusteroperators/insights.md
20+
- Alert Definition https://github.com/openshift/managed-cluster-config/blob/master/deploy/sre-prometheus/insights/100-sre-insightsoperator.PrometheusRule.yaml
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#!/bin/bash
# Blocks DNS resolution of api.stage.openshift.com for a cluster's VPC by
# creating a Route 53 Resolver DNS Firewall rule group (with a BLOCK/NODATA
# rule for that domain) and associating it with the VPC tagged for the cluster.
#
# Usage: block-api-openshift.sh <cluster-id>
set -eox pipefail

if [ "$#" -ne 1 ]; then
    echo "usage: $0 <cluster-id>" >&2
    exit 1
fi
CLUSTER_ID="$1"

# Exported so that the aws CLI child processes see it and do not page output.
export AWS_PAGER=""

# Load the cloud credentials emitted by backplane into this shell.
eval "$(ocm backplane cloud credentials -o env "$CLUSTER_ID")"

# Exported so that the aws CLI operates in the cluster's region.
AWS_REGION=$(ocm describe cluster "$CLUSTER_ID" --json | jq -r '.region.id')
export AWS_REGION

FW_RULE_GROUP_ID=$(aws route53resolver create-firewall-rule-group --name "api stage openshift com" | jq -r '.FirewallRuleGroup.Id')
FW_DOMAIN_LIST_ID=$(aws route53resolver create-firewall-domain-list --name "api stage openshift com" | jq -r '.FirewallDomainList.Id')
aws route53resolver update-firewall-domains --firewall-domain-list-id "$FW_DOMAIN_LIST_ID" --domains "api.stage.openshift.com" --operation "ADD"
aws route53resolver create-firewall-rule --firewall-rule-group-id "$FW_RULE_GROUP_ID" --firewall-domain-list-id "$FW_DOMAIN_LIST_ID" --priority "1" --action "BLOCK" --block-response "NODATA" --name "api stage openshift com"

# Find the cluster's VPC through its kubernetes.io/cluster/<infra-id> tag.
INFRA_ID=$(ocm describe cluster "$CLUSTER_ID" --json | jq -r '.infra_id')
VPC_ID=$(aws ec2 describe-vpcs --filters "Name=tag-key,Values=kubernetes.io/cluster/$INFRA_ID" | jq -r '.Vpcs[0].VpcId')
aws route53resolver associate-firewall-rule-group --firewall-rule-group-id "$FW_RULE_GROUP_ID" --name "rgassoc-$VPC_ID-$FW_RULE_GROUP_ID" --priority "1001" --vpc-id "$VPC_ID"
13+

pkg/investigations/registry.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/chgm"
66
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/clustermonitoringerrorbudgetburn"
77
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/cpd"
8+
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/insightsoperatordown"
89
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
910
)
1011

@@ -14,6 +15,7 @@ var availableInvestigations = []investigation.Investigation{
1415
&chgm.Investiation{},
1516
&clustermonitoringerrorbudgetburn.Investigation{},
1617
&cpd.Investigation{},
18+
&insightsoperatordown.Investigation{},
1719
}
1820

1921
// GetInvestigation returns the first Investigation that applies to the given alert title.

pkg/ocm/ocm.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212

1313
sdk "github.com/openshift-online/ocm-sdk-go"
1414

15+
amv1 "github.com/openshift-online/ocm-sdk-go/accountsmgmt/v1"
1516
cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1"
1617
servicelogsv1 "github.com/openshift-online/ocm-sdk-go/servicelogs/v1"
1718
awsv1alpha1 "github.com/openshift/aws-account-operator/api/v1alpha1"
@@ -308,3 +309,30 @@ func (c *SdkClient) IsAccessProtected(cluster *cmv1.Cluster) (bool, error) {
308309
}
309310
return enabled, nil
310311
}
312+
313+
func GetCreatorFromCluster(ocmConn *sdk.Connection, cluster *cmv1.Cluster) (*amv1.Account, error) {
314+
logging.Debugf("Getting subscription from cluster: %s", cluster.ID())
315+
cmv1Subscription, ok := cluster.GetSubscription()
316+
if !ok {
317+
return nil, fmt.Errorf("failed to get subscription from cluster: %s", cluster.ID())
318+
}
319+
subscriptionResponse, err := ocmConn.AccountsMgmt().V1().Subscriptions().Subscription(cmv1Subscription.ID()).Get().Send()
320+
if err != nil {
321+
return nil, err
322+
}
323+
324+
subscription, ok := subscriptionResponse.GetBody()
325+
if !ok {
326+
return nil, errors.New("failed to get subscription")
327+
}
328+
329+
if status := subscription.Status(); status != "Active" {
330+
return nil, fmt.Errorf("Expecting status 'Active' found %v\n", status)
331+
}
332+
333+
creator, ok := subscription.GetCreator()
334+
if !ok {
335+
return nil, errors.New("failed to get creator from subscription")
336+
}
337+
return creator, nil
338+
}

test/generate_incident.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ declare -A alert_mapping=(
77
["ClusterHasGoneMissing"]="cadtest has gone missing"
88
["ClusterProvisioningDelay"]="ClusterProvisioningDelay -"
99
["ClusterMonitoringErrorBudgetBurnSRE"]="ClusterMonitoringErrorBudgetBurnSRE Critical (1)"
10+
["InsightsOperatorDown"]="InsightsOperatorDown"
1011
)
1112

1213
# Function to print help message

0 commit comments

Comments
 (0)