Skip to content

Commit b1691b0

Browse files
authored
OSD-29470: To create E2E Tests for CAD - Cluster has gone missing - Infra Nodes turned off (#441)
* Final changes * Final changes * Made changes as per the comments received * Made changes as per the commits * Final changes * Changed the PagerDuty token * Fixed lint issue * Fixed lint issue * Fixed lint issue * Made all the changes as per the comments * Made all the changes as per the comments * Made all the changes as per the comments * Made changes as per the comments * Made changes as per the comments * Made changes as per the comments
1 parent 71ed508 commit b1691b0

File tree

5 files changed

+253
-35
lines changed

5 files changed

+253
-35
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ require (
158158
github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect
159159
github.com/google/s2a-go v0.1.9 // indirect
160160
github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect
161-
github.com/google/uuid v1.6.0 // indirect
161+
github.com/google/uuid v1.6.0 //indirect
162162
github.com/googleapis/enterprise-certificate-proxy v0.3.4 // indirect
163163
github.com/googleapis/gax-go/v2 v2.14.1 // indirect
164164
github.com/gorilla/css v1.0.1 // indirect

test/e2e/configuration_anomaly_detection_test.go

Lines changed: 145 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"context"
88
"fmt"
99
"os"
10+
"strings"
1011
"time"
1112

1213
"github.com/aws/aws-sdk-go-v2/config"
@@ -17,21 +18,24 @@ import (
1718
. "github.com/onsi/gomega"
1819
awsinternal "github.com/openshift/configuration-anomaly-detection/pkg/aws"
1920
"github.com/openshift/configuration-anomaly-detection/pkg/ocm"
21+
"github.com/openshift/configuration-anomaly-detection/test/e2e/utils"
2022
ocme2e "github.com/openshift/osde2e-common/pkg/clients/ocm"
2123
"github.com/openshift/osde2e-common/pkg/clients/openshift"
2224
appsv1 "k8s.io/api/apps/v1"
25+
corev1 "k8s.io/api/core/v1"
2326
"k8s.io/client-go/util/retry"
2427
logger "sigs.k8s.io/controller-runtime/pkg/log"
2528
)
2629

2730
var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
2831
var (
29-
ocme2eCli *ocme2e.Client
30-
ocmCli ocm.Client
31-
k8s *openshift.Client
32-
region string
33-
provider string
34-
clusterID string
32+
ocme2eCli *ocme2e.Client
33+
ocmCli ocm.Client
34+
k8s *openshift.Client
35+
region string
36+
provider string
37+
clusterID string
38+
testPdClient utils.TestPagerDutyClient
3539
)
3640

3741
BeforeAll(func(ctx context.Context) {
@@ -41,7 +45,7 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
4145
ocmToken := os.Getenv("OCM_TOKEN")
4246
clientID := os.Getenv("CLIENT_ID")
4347
clientSecret := os.Getenv("CLIENT_SECRET")
44-
clusterID = os.Getenv("CLUSTER_ID")
48+
clusterID = os.Getenv("OCM_CLUSTER_ID")
4549
cadOcmFilePath := os.Getenv("CAD_OCM_FILE_PATH")
4650

4751
Expect(ocmToken).NotTo(BeEmpty(), "OCM_TOKEN must be set")
@@ -62,6 +66,10 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
6266

6367
provider, err = k8s.GetProvider(ctx)
6468
Expect(err).NotTo(HaveOccurred(), "Could not determine provider")
69+
70+
pdRoutingKey := os.Getenv("CAD_PAGERDUTY_ROUTING_KEY")
71+
Expect(pdRoutingKey).NotTo(BeEmpty(), "PAGERDUTY_ROUTING_KEY must be set")
72+
testPdClient = utils.NewClient(pdRoutingKey)
6573
})
6674

6775
AfterAll(func() {
@@ -76,7 +84,6 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
7684
awsSecretKey := os.Getenv("AWS_SECRET_ACCESS_KEY")
7785
Expect(awsAccessKey).NotTo(BeEmpty(), "AWS access key not found")
7886
Expect(awsSecretKey).NotTo(BeEmpty(), "AWS secret key not found")
79-
8087
awsCfg, err := config.LoadDefaultConfig(ctx,
8188
config.WithRegion(region),
8289
config.WithCredentialsProvider(credentials.NewStaticCredentialsProvider(
@@ -86,38 +93,43 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
8693
)),
8794
)
8895
Expect(err).NotTo(HaveOccurred(), "Failed to create AWS config")
89-
9096
ec2Client := ec2.NewFromConfig(awsCfg)
91-
ec2Wrapper := NewEC2ClientWrapper(ec2Client)
92-
97+
ec2Wrapper := utils.NewEC2ClientWrapper(ec2Client)
9398
awsCli, err := awsinternal.NewClient(awsCfg)
9499
Expect(err).NotTo(HaveOccurred(), "Failed to create AWS client")
95-
96100
clusterResource, err := ocme2eCli.ClustersMgmt().V1().Clusters().Cluster(clusterID).Get().Send()
97101
Expect(err).NotTo(HaveOccurred(), "Failed to fetch cluster from OCM")
98-
99102
cluster := clusterResource.Body()
100103
infraID := cluster.InfraID()
101104
Expect(infraID).NotTo(BeEmpty(), "InfraID missing from cluster")
102-
103105
sgID, err := awsCli.GetSecurityGroupID(infraID)
104106
Expect(err).NotTo(HaveOccurred(), "Failed to get security group ID")
105-
106107
// Get limited support reasons before blocking egress
107-
lsResponseBefore, err := GetLimitedSupportReasons(ocme2eCli, clusterID)
108+
lsResponseBefore, err := utils.GetLimitedSupportReasons(ocme2eCli, clusterID)
108109
Expect(err).NotTo(HaveOccurred(), "Failed to get limited support reasons")
109110
lsReasonsBefore := lsResponseBefore.Items().Len()
110-
111111
ginkgo.GinkgoWriter.Printf("Limited support reasons before blocking egress: %d\n", lsReasonsBefore)
112112
ginkgo.GinkgoWriter.Printf("Blocking egress for security group: %s\n", sgID)
113-
114113
// Block egress
115-
Expect(BlockEgress(ctx, ec2Wrapper, sgID)).To(Succeed(), "Failed to block egress")
114+
Expect(utils.BlockEgress(ctx, ec2Wrapper, sgID)).To(Succeed(), "Failed to block egress")
116115
ginkgo.GinkgoWriter.Printf("Egress blocked\n")
117116

118-
time.Sleep(20 * time.Minute)
117+
// Clean up: restore egress - moved up to minimize risk of exits before cleanup
118+
defer func() {
119+
err := utils.RestoreEgress(ctx, ec2Wrapper, sgID)
120+
if err != nil {
121+
ginkgo.GinkgoWriter.Printf("Failed to restore egress: %v\n", err)
122+
} else {
123+
ginkgo.GinkgoWriter.Printf("Egress restored\n")
124+
}
125+
}()
126+
127+
_, err = testPdClient.TriggerIncident("ClusterHasGoneMissing", clusterID)
128+
Expect(err).NotTo(HaveOccurred(), "Failed to trigger silent PagerDuty alert")
119129

120-
lsResponseAfter, err := GetLimitedSupportReasons(ocme2eCli, clusterID)
130+
time.Sleep(3 * time.Minute)
131+
132+
lsResponseAfter, err := utils.GetLimitedSupportReasons(ocme2eCli, clusterID)
121133
Expect(err).NotTo(HaveOccurred(), "Failed to get limited support reasons")
122134

123135
// Print the response data
@@ -128,13 +140,13 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
128140
items := lsResponseAfter.Items().Slice()
129141
for i, item := range items {
130142
fmt.Printf("Reason #%d:\n", i+1)
131-
fmt.Printf(" - Summary: %s\n", item.Summary())
132-
fmt.Printf(" - Details: %s\n", item.Details())
143+
fmt.Printf(" - Summary: %s\n", item.Summary())
144+
fmt.Printf(" - Details: %s\n", item.Details())
133145
}
134146

135-
// Restore egress
136-
Expect(RestoreEgress(ctx, ec2Wrapper, sgID)).To(Succeed(), "Failed to restore egress")
137-
ginkgo.GinkgoWriter.Printf("Egress restored\n")
147+
// Verify test result: Expect new limited support reasons to be found after blocking egress
148+
Expect(lsResponseAfter.Items().Len()).To(BeNumerically(">", lsReasonsBefore),
149+
"No new limited support reasons found after blocking egress")
138150
}
139151
})
140152

@@ -147,11 +159,11 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
147159
Expect(cluster).ToNot(BeNil(), "received nil cluster from OCM")
148160

149161
// Get service logs
150-
logs, err := GetServiceLogs(ocmCli, cluster)
162+
logs, err := utils.GetServiceLogs(ocmCli, cluster)
151163
Expect(err).ToNot(HaveOccurred(), "Failed to get service logs")
152164
logsBefore := logs.Items().Slice()
153165

154-
lsResponseBefore, err := GetLimitedSupportReasons(ocme2eCli, clusterID)
166+
lsResponseBefore, err := utils.GetLimitedSupportReasons(ocme2eCli, clusterID)
155167
Expect(err).NotTo(HaveOccurred(), "Failed to get limited support reasons")
156168
lsReasonsBefore := lsResponseBefore.Items().Len()
157169

@@ -205,13 +217,16 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
205217
Expect(err).ToNot(HaveOccurred(), "failed to scale down alertmanager")
206218
fmt.Printf("Alertmanager scaled down from %d to 0 replicas. Waiting...\n", originalAMReplicas)
207219

208-
time.Sleep(20 * time.Minute)
220+
_, err = testPdClient.TriggerIncident("ClusterHasGoneMissing", clusterID)
221+
Expect(err).NotTo(HaveOccurred(), "Failed to trigger silent PagerDuty alert")
222+
223+
time.Sleep(1 * time.Minute)
209224

210-
logs, err = GetServiceLogs(ocmCli, cluster)
225+
logs, err = utils.GetServiceLogs(ocmCli, cluster)
211226
Expect(err).ToNot(HaveOccurred(), "Failed to get service logs")
212227
logsAfter := logs.Items().Slice()
213228

214-
lsResponseAfter, err := GetLimitedSupportReasons(ocme2eCli, clusterID)
229+
lsResponseAfter, err := utils.GetLimitedSupportReasons(ocme2eCli, clusterID)
215230
Expect(err).NotTo(HaveOccurred(), "Failed to get limited support reasons")
216231
lsReasonsAfter := lsResponseAfter.Items().Len()
217232

@@ -266,4 +281,102 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
266281
fmt.Println("Test completed: All components restored to original replica counts.")
267282
}
268283
})
269-
})
284+
285+
It("AWS CCS: Cluster has gone missing - Infra nodes turned off", Label("aws", "ccs", "infra-nodes", "limited-support"), func(ctx context.Context) {
286+
if provider != "aws" {
287+
Skip("This test only runs on AWS clusters")
288+
}
289+
awsAccessKey := os.Getenv("AWS_ACCESS_KEY_ID")
290+
awsSecretKey := os.Getenv("AWS_SECRET_ACCESS_KEY")
291+
Expect(awsAccessKey).NotTo(BeEmpty(), "AWS access key not found")
292+
Expect(awsSecretKey).NotTo(BeEmpty(), "AWS secret key not found")
293+
awsCfg, err := config.LoadDefaultConfig(ctx,
294+
config.WithRegion(region),
295+
config.WithCredentialsProvider(credentials.NewStaticCredentialsProvider(
296+
awsAccessKey,
297+
awsSecretKey,
298+
"",
299+
)),
300+
)
301+
Expect(err).NotTo(HaveOccurred(), "Failed to create AWS config")
302+
ec2Client := ec2.NewFromConfig(awsCfg)
303+
304+
ginkgo.GinkgoWriter.Println("Getting limited support reasons before infra node shutdown...")
305+
lsResponseBefore, err := utils.GetLimitedSupportReasons(ocme2eCli, clusterID)
306+
Expect(err).NotTo(HaveOccurred(), "Failed to get limited support reasons")
307+
lsReasonsBefore := lsResponseBefore.Items().Len()
308+
309+
ginkgo.GinkgoWriter.Printf("Limited support reasons before infra node shutdown: %d\n", lsReasonsBefore)
310+
311+
var nodeList corev1.NodeList
312+
err = k8s.List(ctx, &nodeList)
313+
Expect(err).NotTo(HaveOccurred(), "Failed to list nodes")
314+
var instanceIDs []string
315+
for _, node := range nodeList.Items {
316+
if _, isInfra := node.Labels["node-role.kubernetes.io/infra"]; !isInfra {
317+
continue
318+
}
319+
providerID := node.Spec.ProviderID
320+
Expect(providerID).ToNot(BeEmpty(), "Infra node missing providerID")
321+
parts := strings.Split(providerID, "/")
322+
instanceIDs = append(instanceIDs, parts[len(parts)-1])
323+
}
324+
Expect(instanceIDs).NotTo(BeEmpty(), "No infrastructure EC2 instance IDs found")
325+
ginkgo.GinkgoWriter.Printf("Found %d infra node(s) with EC2 instance IDs: %v\n", len(instanceIDs), instanceIDs)
326+
327+
// Setup deferred EC2 restart to ensure it happens regardless of test outcome
328+
defer func() {
329+
ginkgo.GinkgoWriter.Println("Restarting infra nodes regardless of test status...")
330+
_, err := ec2Client.StartInstances(ctx, &ec2.StartInstancesInput{
331+
InstanceIds: instanceIDs,
332+
})
333+
if err != nil {
334+
ginkgo.GinkgoWriter.Printf("Failed to start infra EC2 instances: %v\n", err)
335+
return
336+
}
337+
err = ec2.NewInstanceRunningWaiter(ec2Client).Wait(ctx, &ec2.DescribeInstancesInput{
338+
InstanceIds: instanceIDs,
339+
}, 10*time.Minute)
340+
if err != nil {
341+
ginkgo.GinkgoWriter.Printf("Infra EC2 instances did not start in time: %v\n", err)
342+
return
343+
}
344+
ginkgo.GinkgoWriter.Println("Infra nodes successfully restarted")
345+
}()
346+
347+
ginkgo.GinkgoWriter.Println("Stopping infra nodes...")
348+
_, err = ec2Client.StopInstances(ctx, &ec2.StopInstancesInput{
349+
InstanceIds: instanceIDs,
350+
})
351+
Expect(err).NotTo(HaveOccurred(), "Failed to stop infra EC2 instances")
352+
err = ec2.NewInstanceStoppedWaiter(ec2Client).Wait(ctx, &ec2.DescribeInstancesInput{
353+
InstanceIds: instanceIDs,
354+
}, 6*time.Minute)
355+
Expect(err).NotTo(HaveOccurred(), "Infra EC2 instances did not stop in time")
356+
ginkgo.GinkgoWriter.Println("Infra nodes successfully stopped")
357+
358+
_, err = testPdClient.TriggerIncident("ClusterHasGoneMissing", clusterID)
359+
Expect(err).NotTo(HaveOccurred(), "Failed to trigger silent PagerDuty alert")
360+
361+
ginkgo.GinkgoWriter.Println("Sleeping for 2 minutes before checking limited support reasons...")
362+
time.Sleep(2 * time.Minute)
363+
364+
lsResponseAfter, err := utils.GetLimitedSupportReasons(ocme2eCli, clusterID)
365+
Expect(err).NotTo(HaveOccurred(), "Failed to get limited support reasons")
366+
367+
// Print the response data
368+
fmt.Println("Limited Support Response After Stopping Infra Nodes:")
369+
fmt.Printf("Total items: %d\n", lsResponseAfter.Items().Len())
370+
371+
// Iterate through each item and print details
372+
items := lsResponseAfter.Items().Slice()
373+
for i, item := range items {
374+
fmt.Printf("Reason #%d:\n", i+1)
375+
fmt.Printf(" - Summary: %s\n", item.Summary())
376+
fmt.Printf(" - Details: %s\n", item.Details())
377+
}
378+
379+
Expect(lsResponseAfter.Items().Len()).To(BeNumerically(">", lsReasonsBefore),
380+
"Expected more limited support reasons after infrastructure node shutdown")
381+
})
382+
}, ginkgo.ContinueOnFailure)

test/e2e/aws.go renamed to test/e2e/utils/aws.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package osde2etests
1+
package utils
22

33
import (
44
"context"

0 commit comments

Comments
 (0)