Skip to content

Commit 7b0f2f0

Browse files
authored
Create CAD remediation for UpgradeConfigSyncFailureOver4HrSRE (#430)
* Initial working commit that checks if user is banned and gets pull secret from cluster * working changes. * Working code that compares OCM and cluster pull secrets. * working with test. * Added step by step in readme to integration test OCM vs Cluster pull-secret. * added better instruction * removed accidental new line * fixed error variable naming * Added better notes and cleanup uneeded lines. * fixed spacing for linter
1 parent 642e71b commit 7b0f2f0

File tree

5 files changed

+252
-0
lines changed

5 files changed

+252
-0
lines changed

pkg/investigations/registry.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/cpd"
88
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/insightsoperatordown"
99
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
10+
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/upgradeconfigsyncfailureover4hr"
1011
)
1112

1213
// availableInvestigations holds all Investigation implementations.
@@ -16,6 +17,7 @@ var availableInvestigations = []investigation.Investigation{
1617
&clustermonitoringerrorbudgetburn.Investigation{},
1718
&cpd.Investigation{},
1819
&insightsoperatordown.Investigation{},
20+
&upgradeconfigsyncfailureover4hr.Investigation{},
1921
}
2022

2123
// GetInvestigation returns the first Investigation that applies to the given alert title.
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# upgradeconfigsyncfailureover4hr Investigation
2+
3+
Package upgradeconfigsyncfailureover4hr contains functionality for the UpgradeConfigSyncFailureOver4HrSRE investigation
4+
5+
### Integration test for Secret Key check
6+
In order to integration test the logic for checking the pull-secret in OCM vs the pull-secret on your cluster you'll need to do a few things.
7+
8+
1. Set up a cluster and test incident in pagerduty as you would for any CAD investigation test.
9+
2. Get the pull secret from the cluster and output it to a file.
10+
11+
`oc get secret pull-secret -ojson -n openshift-config --as backplane-cluster-admin > backup_pull_secret.json`
12+
3. Make a copy of the file you just created for easy backup. We'll be making edits later to the copied file.
13+
`cp backup_pull_secret.json broken_pull_secret.json`
14+
4. Decode the .dockerconfigjson entry (it is base64-encoded, not encrypted). The easiest way to do this is to copy the whole value inside the quotes to your clipboard, echo it in your terminal, pipe it through `base64 -d`, and save the output in a separate file.
15+
16+
`echo $copied_value | base64 -d`
17+
5. Find the entry for registry.connect.redhat.com and copy the base64-encoded value of its auth entry. Exclude the quotes again. Repeat the process of decoding this value using `base64 -d`.
18+
19+
`echo $copied_value | base64 -d`
20+
6. Edit this value in a text editor and change the part after the colon. Leave the part before the colon as it is.
21+
7. Reverse the decoding process detailed above. First you'll need to base64-encode your new pull-secret.dockerconfigjson.registry.connect.redhat.com.auth value (the one we just changed). Simply echo it on your command line and pipe it into base64. Place the whole value in single quotes to avoid any text parsing issues.
22+
23+
`echo $changed_value | base64`
24+
8. Put that value into the registry.connect.redhat.com.auth entry of the decoded .dockerconfigjson you saved in step 4, then base64-encode the whole thing. Take that encoded value and use it to replace the .dockerconfigjson value in your broken_pull_secret.json file.
25+
9. Apply the newly broken pull-secret json file to your cluster using oc apply.
26+
27+
`oc apply -f broken_pull_secret.json --as backplane-cluster-admin`
28+
10. Rerun your test according to the CAD readme. This should return a warning in the logs `⚠️ Pull secret does not match on cluster and in OCM` and apply the same message to the PagerDuty incident.
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
// Package upgradeconfigsyncfailureover4hr contains functionality for the UpgradeConfigSyncFailureOver4HrSRE investigation
2+
package upgradeconfigsyncfailureover4hr
3+
4+
import (
5+
"context"
6+
"encoding/base64"
7+
"errors"
8+
"fmt"
9+
"strings"
10+
11+
v1 "github.com/openshift-online/ocm-sdk-go/accountsmgmt/v1"
12+
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
13+
k8sclient "github.com/openshift/configuration-anomaly-detection/pkg/k8s"
14+
"github.com/openshift/configuration-anomaly-detection/pkg/logging"
15+
"github.com/openshift/configuration-anomaly-detection/pkg/notewriter"
16+
ocm "github.com/openshift/configuration-anomaly-detection/pkg/ocm"
17+
corev1 "k8s.io/api/core/v1"
18+
"k8s.io/apimachinery/pkg/types"
19+
"sigs.k8s.io/controller-runtime/pkg/client"
20+
)
21+
22+
// Investigation is the stateless handler for the UpgradeConfigSyncFailureOver4HrSRE
// alert. Its methods (Run, Name, Description, ShouldInvestigateAlert,
// IsExperimental) are defined in this file.
type Investigation struct{}
23+
24+
const (
25+
// alertname holds the same alert title that ShouldInvestigateAlert matches against.
alertname = "UpgradeConfigSyncFailureOver4HrSRE"
26+
// remediationName is the name passed to k8sclient.New and k8sclient.Cleanup in Run.
remediationName = "upgradeconfigsyncfailureover4hr"
27+
)
28+
29+
func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
30+
result := investigation.InvestigationResult{}
31+
notes := notewriter.New("UpgradeConfigSyncFailureOver4Hr", logging.RawLogger)
32+
k8scli, err := k8sclient.New(r.Cluster.ID(), r.OcmClient, remediationName)
33+
if err != nil {
34+
return result, fmt.Errorf("unable to initialize k8s cli: %w", err)
35+
}
36+
defer func() {
37+
deferErr := k8sclient.Cleanup(r.Cluster.ID(), r.OcmClient, remediationName)
38+
if deferErr != nil {
39+
logging.Error(deferErr)
40+
err = errors.Join(err, deferErr)
41+
}
42+
}()
43+
logging.Infof("Checking if user is Banned.")
44+
userBannedStatus, userBannedNotes, err := ocm.CheckIfUserBanned(r.OcmClient, r.Cluster)
45+
if err != nil {
46+
notes.AppendWarning("encountered an issue when checking if the cluster owner is banned: %s\nPlease investigate.", err)
47+
return result, r.PdClient.EscalateIncidentWithNote(notes.String())
48+
}
49+
if userBannedStatus {
50+
notes.AppendWarning(userBannedNotes)
51+
} else {
52+
notes.AppendSuccess("User is not banned.")
53+
}
54+
user, err := ocm.GetCreatorFromCluster(r.OcmClient.GetConnection(), r.Cluster)
55+
logging.Infof("User ID is: %v", user.ID())
56+
clusterSecretToken, note, err := getClusterPullSecret(k8scli)
57+
if err != nil {
58+
notes.AppendWarning("Failre getting ClusterSecret: %s", err)
59+
return result, r.PdClient.EscalateIncidentWithNote(notes.String())
60+
}
61+
if note != "" {
62+
notes.AppendWarning(note)
63+
}
64+
registryCredential, err := ocm.GetOCMPullSecret(r.OcmClient.GetConnection(), user.ID())
65+
if err != nil {
66+
notes.AppendWarning("Error getting OCMPullSecret: %s", err)
67+
return result, r.PdClient.EscalateIncidentWithNote(notes.String())
68+
}
69+
if clusterSecretToken == registryCredential {
70+
notes.AppendSuccess("Pull Secret matches on cluster and in OCM. Please continue investigation.")
71+
} else {
72+
notes.AppendWarning("Pull secret does not match on cluster and in OCM.")
73+
}
74+
return result, r.PdClient.EscalateIncidentWithNote(notes.String())
75+
}
76+
77+
func getClusterPullSecret(k8scli client.Client) (secretToken string, note string, err error) {
78+
secret := &corev1.Secret{}
79+
err = k8scli.Get(context.TODO(), types.NamespacedName{
80+
Namespace: "openshift-config",
81+
Name: "pull-secret",
82+
}, secret)
83+
if err != nil {
84+
return "", "", err
85+
}
86+
if secret.Data == nil {
87+
return "", "Cluster pull secret Data is empty.", err
88+
}
89+
secretValue, exists := secret.Data[".dockerconfigjson"]
90+
if !exists {
91+
return "", "Cluster pull secret does not contain the necessary .dockerconfigjson", err
92+
}
93+
94+
dockerConfigJson, err := v1.UnmarshalAccessToken(secretValue)
95+
if err != nil {
96+
return "", "", err
97+
}
98+
_, exists = dockerConfigJson.Auths()["cloud.openshift.com"]
99+
if !exists {
100+
return "", "cloud.openshift.com value not found in clusterPullSecret. This means there is an issue with the pull secret on the cluster.", err
101+
}
102+
103+
value, err := base64.StdEncoding.DecodeString(dockerConfigJson.Auths()["registry.connect.redhat.com"].Auth())
104+
if err != nil {
105+
return "", "", err
106+
}
107+
_, splitValue, _ := strings.Cut(string(value), ":")
108+
return splitValue, "", nil
109+
}
110+
111+
func (c *Investigation) Name() string {
112+
return "UpgradeConfigSyncFailureOver4hr"
113+
}
114+
115+
func (c *Investigation) Description() string {
116+
return "Investigates the UpgradeConfigSyncFailureOver4hr alert"
117+
}
118+
119+
func (c *Investigation) ShouldInvestigateAlert(alert string) bool {
120+
return strings.Contains(alert, "UpgradeConfigSyncFailureOver4HrSRE")
121+
}
122+
123+
func (c *Investigation) IsExperimental() bool {
124+
return false
125+
}
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
package upgradeconfigsyncfailureover4hr
2+
3+
import (
4+
"strings"
5+
"testing"
6+
7+
corev1 "k8s.io/api/core/v1"
8+
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
9+
"sigs.k8s.io/controller-runtime/pkg/client/fake"
10+
)
11+
12+
func TestGetClusterPullSecret(t *testing.T) {
13+
tests := []struct {
14+
name string
15+
data string
16+
secretToken string
17+
expectError bool
18+
expectedNote string
19+
}{
20+
{
21+
name: "happy path",
22+
data: "{\"auths\":{\"950916221866.dkr.ecr.us-east-1.amazonaws.com\":{\"auth\":\"testTokenValue\",\"email\":\"\"},\"cloud.openshift.com\":{\"auth\":\"TestAuthValue\",\"email\":\"[email protected]\"},\"pull.q1w2.quay.rhcloud.com\":{\"auth\":\"TestQuayAuthValue\"},\"quay.io\":{\"auth\":\"TestPersonalAuthValue\",\"email\":\"[email protected]\"},\"registry.ci.openshift.org\":{\"auth\":\"TestRegistry-connect-redhat-com-value\"},\"registry.connect.redhat.com\":{\"auth\":\"dWhjLXBvb2wtdGVzdC1wb29sLXZhbHVlLWhlcmU6Q29ycmVjdFZhbHVlCg==\"},\"registry.redhat.io\":{\"auth\":\"TestPersonalTokenTwo\",\"email\":\"[email protected]\"}}}",
23+
secretToken: "CorrectValue\n",
24+
expectError: false,
25+
},
26+
{
27+
name: "Value mismatch",
28+
data: "{\"auths\":{\"950916221866.dkr.ecr.us-east-1.amazonaws.com\":{\"auth\":\"testTokenValue\",\"email\":\"\"},\"cloud.openshift.com\":{\"auth\":\"TestAuthValue\",\"email\":\"[email protected]\"},\"pull.q1w2.quay.rhcloud.com\":{\"auth\":\"TestQuayAuthValue\"},\"quay.io\":{\"auth\":\"TestPersonalAuthValue\",\"email\":\"[email protected]\"},\"registry.ci.openshift.org\":{\"auth\":\"TestRegistry-connect-redhat-com-value\"},\"registry.connect.redhat.com\":{\"auth\":\"dWhjLXBvb2wtdGVzdC1wb29sLXZhbHVlLWhlcmU6Q29ycmVjdFZhbHVlCg==\"},\"registry.redhat.io\":{\"auth\":\"TestPersonalTokenTwo\",\"email\":\"[email protected]\"}}}",
29+
secretToken: "IncorrectValue\n",
30+
expectError: true,
31+
},
32+
{
33+
name: "No entry for cloud.openshift.com",
34+
data: "{\"auths\":{\"950916221866.dkr.ecr.us-east-1.amazonaws.com\":{\"auth\":\"testTokenValue\",\"email\":\"\"},\"MissingValue\":{\"auth\":\"TestAuthValue\",\"email\":\"[email protected]\"},\"pull.q1w2.quay.rhcloud.com\":{\"auth\":\"TestQuayAuthValue\"},\"quay.io\":{\"auth\":\"TestPersonalAuthValue\",\"email\":\"[email protected]\"},\"registry.ci.openshift.org\":{\"auth\":\"TestRegistry-connect-redhat-com-value\"},\"registry.connect.redhat.com\":{\"auth\":\"dWhjLXBvb2wtdGVzdC1wb29sLXZhbHVlLWhlcmU6Q29ycmVjdFZhbHVlCg==\"},\"registry.redhat.io\":{\"auth\":\"TestPersonalTokenTwo\",\"email\":\"[email protected]\"}}}",
35+
secretToken: "IncorrectValue\n",
36+
expectError: true,
37+
expectedNote: "cloud.openshift.com value not found in clusterPullSecret",
38+
},
39+
}
40+
41+
for _, tt := range tests {
42+
t.Run(tt.name, func(t *testing.T) {
43+
secretTest := &corev1.Secret{
44+
ObjectMeta: v1.ObjectMeta{
45+
Name: "pull-secret",
46+
Namespace: "openshift-config",
47+
},
48+
Type: corev1.DockerConfigJsonKey,
49+
Data: map[string][]byte{
50+
".dockerconfigjson": []byte(tt.data),
51+
},
52+
}
53+
k8scli := fake.NewClientBuilder().WithObjects(secretTest).Build()
54+
result, note, _ := getClusterPullSecret(k8scli)
55+
if result != tt.secretToken {
56+
if !strings.Contains(note, tt.expectedNote) {
57+
t.Errorf("Expected note message: %s. Got %s", tt.expectedNote, note)
58+
}
59+
if !tt.expectError {
60+
t.Errorf("expected token %s to match %s", result, tt.secretToken)
61+
}
62+
}
63+
})
64+
}
65+
}

pkg/ocm/ocm.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,20 @@ func (c *SdkClient) IsAccessProtected(cluster *cmv1.Cluster) (bool, error) {
310310
return enabled, nil
311311
}
312312

313+
func CheckIfUserBanned(ocmClient Client, cluster *cmv1.Cluster) (bool, string, error) {
314+
user, err := GetCreatorFromCluster(ocmClient.GetConnection(), cluster)
315+
if err != nil {
316+
return false, "encountered an issue when checking if the cluster owner is banned. Please investigate.", err
317+
}
318+
319+
if user.Banned() {
320+
noteMessage := fmt.Sprintf("User is banned %s. Ban description %s.\n Please open a proactive case, so that MCS can resolve the ban or organize a ownership transfer.", user.BanCode(), user.BanDescription())
321+
logging.Warnf(noteMessage)
322+
return true, noteMessage, nil
323+
}
324+
return false, "User is not banned.", nil
325+
}
326+
313327
func GetCreatorFromCluster(ocmConn *sdk.Connection, cluster *cmv1.Cluster) (*amv1.Account, error) {
314328
logging.Debugf("Getting subscription from cluster: %s", cluster.ID())
315329
cmv1Subscription, ok := cluster.GetSubscription()
@@ -336,3 +350,21 @@ func GetCreatorFromCluster(ocmConn *sdk.Connection, cluster *cmv1.Cluster) (*amv
336350
}
337351
return creator, nil
338352
}
353+
354+
func GetOCMPullSecret(ocmConn *sdk.Connection, userID string) (string, error) {
355+
searchString := fmt.Sprintf("account_id = '%s'", userID)
356+
var registryCredentialToken string
357+
registryCredentials, err := ocmConn.AccountsMgmt().V1().RegistryCredentials().List().Search(searchString).Send()
358+
if err != nil {
359+
return "", err
360+
}
361+
for _, tempToken := range registryCredentials.Items().Items() {
362+
if tempToken.Registry().ID() == "Redhat_registry.redhat.io" {
363+
registryCredentialToken = tempToken.Token()
364+
}
365+
}
366+
if registryCredentialToken == "" {
367+
return "", errors.New("failed to parse pull secret from OCM")
368+
}
369+
return registryCredentialToken, nil
370+
}

0 commit comments

Comments
 (0)