7
7
"context"
8
8
"fmt"
9
9
"os"
10
+ "strings"
10
11
"time"
11
12
12
13
"github.com/aws/aws-sdk-go-v2/config"
@@ -17,21 +18,24 @@ import (
17
18
. "github.com/onsi/gomega"
18
19
awsinternal "github.com/openshift/configuration-anomaly-detection/pkg/aws"
19
20
"github.com/openshift/configuration-anomaly-detection/pkg/ocm"
21
+ "github.com/openshift/configuration-anomaly-detection/test/e2e/utils"
20
22
ocme2e "github.com/openshift/osde2e-common/pkg/clients/ocm"
21
23
"github.com/openshift/osde2e-common/pkg/clients/openshift"
22
24
appsv1 "k8s.io/api/apps/v1"
25
+ corev1 "k8s.io/api/core/v1"
23
26
"k8s.io/client-go/util/retry"
24
27
logger "sigs.k8s.io/controller-runtime/pkg/log"
25
28
)
26
29
27
30
var _ = Describe ("Configuration Anomaly Detection" , Ordered , func () {
28
31
var (
29
- ocme2eCli * ocme2e.Client
30
- ocmCli ocm.Client
31
- k8s * openshift.Client
32
- region string
33
- provider string
34
- clusterID string
32
+ ocme2eCli * ocme2e.Client
33
+ ocmCli ocm.Client
34
+ k8s * openshift.Client
35
+ region string
36
+ provider string
37
+ clusterID string
38
+ testPdClient utils.TestPagerDutyClient
35
39
)
36
40
37
41
BeforeAll (func (ctx context.Context ) {
@@ -41,7 +45,7 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
41
45
ocmToken := os .Getenv ("OCM_TOKEN" )
42
46
clientID := os .Getenv ("CLIENT_ID" )
43
47
clientSecret := os .Getenv ("CLIENT_SECRET" )
44
- clusterID = os .Getenv ("CLUSTER_ID " )
48
+ clusterID = os .Getenv ("OCM_CLUSTER_ID " )
45
49
cadOcmFilePath := os .Getenv ("CAD_OCM_FILE_PATH" )
46
50
47
51
Expect (ocmToken ).NotTo (BeEmpty (), "OCM_TOKEN must be set" )
@@ -62,6 +66,10 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
62
66
63
67
provider , err = k8s .GetProvider (ctx )
64
68
Expect (err ).NotTo (HaveOccurred (), "Could not determine provider" )
69
+
70
+ pdRoutingKey := os .Getenv ("CAD_PAGERDUTY_ROUTING_KEY" )
71
+ Expect (pdRoutingKey ).NotTo (BeEmpty (), "PAGERDUTY_ROUTING_KEY must be set" )
72
+ testPdClient = utils .NewClient (pdRoutingKey )
65
73
})
66
74
67
75
AfterAll (func () {
@@ -76,7 +84,6 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
76
84
awsSecretKey := os .Getenv ("AWS_SECRET_ACCESS_KEY" )
77
85
Expect (awsAccessKey ).NotTo (BeEmpty (), "AWS access key not found" )
78
86
Expect (awsSecretKey ).NotTo (BeEmpty (), "AWS secret key not found" )
79
-
80
87
awsCfg , err := config .LoadDefaultConfig (ctx ,
81
88
config .WithRegion (region ),
82
89
config .WithCredentialsProvider (credentials .NewStaticCredentialsProvider (
@@ -86,38 +93,43 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
86
93
)),
87
94
)
88
95
Expect (err ).NotTo (HaveOccurred (), "Failed to create AWS config" )
89
-
90
96
ec2Client := ec2 .NewFromConfig (awsCfg )
91
- ec2Wrapper := NewEC2ClientWrapper (ec2Client )
92
-
97
+ ec2Wrapper := utils .NewEC2ClientWrapper (ec2Client )
93
98
awsCli , err := awsinternal .NewClient (awsCfg )
94
99
Expect (err ).NotTo (HaveOccurred (), "Failed to create AWS client" )
95
-
96
100
clusterResource , err := ocme2eCli .ClustersMgmt ().V1 ().Clusters ().Cluster (clusterID ).Get ().Send ()
97
101
Expect (err ).NotTo (HaveOccurred (), "Failed to fetch cluster from OCM" )
98
-
99
102
cluster := clusterResource .Body ()
100
103
infraID := cluster .InfraID ()
101
104
Expect (infraID ).NotTo (BeEmpty (), "InfraID missing from cluster" )
102
-
103
105
sgID , err := awsCli .GetSecurityGroupID (infraID )
104
106
Expect (err ).NotTo (HaveOccurred (), "Failed to get security group ID" )
105
-
106
107
// Get limited support reasons before blocking egress
107
- lsResponseBefore , err := GetLimitedSupportReasons (ocme2eCli , clusterID )
108
+ lsResponseBefore , err := utils . GetLimitedSupportReasons (ocme2eCli , clusterID )
108
109
Expect (err ).NotTo (HaveOccurred (), "Failed to get limited support reasons" )
109
110
lsReasonsBefore := lsResponseBefore .Items ().Len ()
110
-
111
111
ginkgo .GinkgoWriter .Printf ("Limited support reasons before blocking egress: %d\n " , lsReasonsBefore )
112
112
ginkgo .GinkgoWriter .Printf ("Blocking egress for security group: %s\n " , sgID )
113
-
114
113
// Block egress
115
- Expect (BlockEgress (ctx , ec2Wrapper , sgID )).To (Succeed (), "Failed to block egress" )
114
+ Expect (utils . BlockEgress (ctx , ec2Wrapper , sgID )).To (Succeed (), "Failed to block egress" )
116
115
ginkgo .GinkgoWriter .Printf ("Egress blocked\n " )
117
116
118
- time .Sleep (20 * time .Minute )
117
+ // Clean up: restore egress - moved up to minimize risk of exits before cleanup
118
+ defer func () {
119
+ err := utils .RestoreEgress (ctx , ec2Wrapper , sgID )
120
+ if err != nil {
121
+ ginkgo .GinkgoWriter .Printf ("Failed to restore egress: %v\n " , err )
122
+ } else {
123
+ ginkgo .GinkgoWriter .Printf ("Egress restored\n " )
124
+ }
125
+ }()
126
+
127
+ _ , err = testPdClient .TriggerIncident ("ClusterHasGoneMissing" , clusterID )
128
+ Expect (err ).NotTo (HaveOccurred (), "Failed to trigger silent PagerDuty alert" )
119
129
120
- lsResponseAfter , err := GetLimitedSupportReasons (ocme2eCli , clusterID )
130
+ time .Sleep (3 * time .Minute )
131
+
132
+ lsResponseAfter , err := utils .GetLimitedSupportReasons (ocme2eCli , clusterID )
121
133
Expect (err ).NotTo (HaveOccurred (), "Failed to get limited support reasons" )
122
134
123
135
// Print the response data
@@ -128,13 +140,13 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
128
140
items := lsResponseAfter .Items ().Slice ()
129
141
for i , item := range items {
130
142
fmt .Printf ("Reason #%d:\n " , i + 1 )
131
- fmt .Printf (" - Summary: %s\n " , item .Summary ())
132
- fmt .Printf (" - Details: %s\n " , item .Details ())
143
+ fmt .Printf (" - Summary: %s\n " , item .Summary ())
144
+ fmt .Printf (" - Details: %s\n " , item .Details ())
133
145
}
134
146
135
- // Restore egress
136
- Expect (RestoreEgress ( ctx , ec2Wrapper , sgID )) .To (Succeed (), "Failed to restore egress" )
137
- ginkgo . GinkgoWriter . Printf ( "Egress restored \n " )
147
+ // Verify test result: Expect new limited support reasons to be found after blocking egress
148
+ Expect (lsResponseAfter . Items (). Len ()) .To (BeNumerically ( ">" , lsReasonsBefore ),
149
+ "No new limited support reasons found after blocking egress " )
138
150
}
139
151
})
140
152
@@ -147,11 +159,11 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
147
159
Expect (cluster ).ToNot (BeNil (), "received nil cluster from OCM" )
148
160
149
161
// Get service logs
150
- logs , err := GetServiceLogs (ocmCli , cluster )
162
+ logs , err := utils . GetServiceLogs (ocmCli , cluster )
151
163
Expect (err ).ToNot (HaveOccurred (), "Failed to get service logs" )
152
164
logsBefore := logs .Items ().Slice ()
153
165
154
- lsResponseBefore , err := GetLimitedSupportReasons (ocme2eCli , clusterID )
166
+ lsResponseBefore , err := utils . GetLimitedSupportReasons (ocme2eCli , clusterID )
155
167
Expect (err ).NotTo (HaveOccurred (), "Failed to get limited support reasons" )
156
168
lsReasonsBefore := lsResponseBefore .Items ().Len ()
157
169
@@ -205,13 +217,16 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
205
217
Expect (err ).ToNot (HaveOccurred (), "failed to scale down alertmanager" )
206
218
fmt .Printf ("Alertmanager scaled down from %d to 0 replicas. Waiting...\n " , originalAMReplicas )
207
219
208
- time .Sleep (20 * time .Minute )
220
+ _ , err = testPdClient .TriggerIncident ("ClusterHasGoneMissing" , clusterID )
221
+ Expect (err ).NotTo (HaveOccurred (), "Failed to trigger silent PagerDuty alert" )
222
+
223
+ time .Sleep (1 * time .Minute )
209
224
210
- logs , err = GetServiceLogs (ocmCli , cluster )
225
+ logs , err = utils . GetServiceLogs (ocmCli , cluster )
211
226
Expect (err ).ToNot (HaveOccurred (), "Failed to get service logs" )
212
227
logsAfter := logs .Items ().Slice ()
213
228
214
- lsResponseAfter , err := GetLimitedSupportReasons (ocme2eCli , clusterID )
229
+ lsResponseAfter , err := utils . GetLimitedSupportReasons (ocme2eCli , clusterID )
215
230
Expect (err ).NotTo (HaveOccurred (), "Failed to get limited support reasons" )
216
231
lsReasonsAfter := lsResponseAfter .Items ().Len ()
217
232
@@ -266,4 +281,102 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
266
281
fmt .Println ("Test completed: All components restored to original replica counts." )
267
282
}
268
283
})
269
- })
284
+
285
+ It ("AWS CCS: Cluster has gone missing - Infra nodes turned off" , Label ("aws" , "ccs" , "infra-nodes" , "limited-support" ), func (ctx context.Context ) {
286
+ if provider != "aws" {
287
+ Skip ("This test only runs on AWS clusters" )
288
+ }
289
+ awsAccessKey := os .Getenv ("AWS_ACCESS_KEY_ID" )
290
+ awsSecretKey := os .Getenv ("AWS_SECRET_ACCESS_KEY" )
291
+ Expect (awsAccessKey ).NotTo (BeEmpty (), "AWS access key not found" )
292
+ Expect (awsSecretKey ).NotTo (BeEmpty (), "AWS secret key not found" )
293
+ awsCfg , err := config .LoadDefaultConfig (ctx ,
294
+ config .WithRegion (region ),
295
+ config .WithCredentialsProvider (credentials .NewStaticCredentialsProvider (
296
+ awsAccessKey ,
297
+ awsSecretKey ,
298
+ "" ,
299
+ )),
300
+ )
301
+ Expect (err ).NotTo (HaveOccurred (), "Failed to create AWS config" )
302
+ ec2Client := ec2 .NewFromConfig (awsCfg )
303
+
304
+ ginkgo .GinkgoWriter .Println ("Getting limited support reasons before infra node shutdown..." )
305
+ lsResponseBefore , err := utils .GetLimitedSupportReasons (ocme2eCli , clusterID )
306
+ Expect (err ).NotTo (HaveOccurred (), "Failed to get limited support reasons" )
307
+ lsReasonsBefore := lsResponseBefore .Items ().Len ()
308
+
309
+ ginkgo .GinkgoWriter .Printf ("Limited support reasons before infra node shutdown: %d\n " , lsReasonsBefore )
310
+
311
+ var nodeList corev1.NodeList
312
+ err = k8s .List (ctx , & nodeList )
313
+ Expect (err ).NotTo (HaveOccurred (), "Failed to list nodes" )
314
+ var instanceIDs []string
315
+ for _ , node := range nodeList .Items {
316
+ if _ , isInfra := node .Labels ["node-role.kubernetes.io/infra" ]; ! isInfra {
317
+ continue
318
+ }
319
+ providerID := node .Spec .ProviderID
320
+ Expect (providerID ).ToNot (BeEmpty (), "Infra node missing providerID" )
321
+ parts := strings .Split (providerID , "/" )
322
+ instanceIDs = append (instanceIDs , parts [len (parts )- 1 ])
323
+ }
324
+ Expect (instanceIDs ).NotTo (BeEmpty (), "No infrastructure EC2 instance IDs found" )
325
+ ginkgo .GinkgoWriter .Printf ("Found %d infra node(s) with EC2 instance IDs: %v\n " , len (instanceIDs ), instanceIDs )
326
+
327
+ // Setup deferred EC2 restart to ensure it happens regardless of test outcome
328
+ defer func () {
329
+ ginkgo .GinkgoWriter .Println ("Restarting infra nodes regardless of test status..." )
330
+ _ , err := ec2Client .StartInstances (ctx , & ec2.StartInstancesInput {
331
+ InstanceIds : instanceIDs ,
332
+ })
333
+ if err != nil {
334
+ ginkgo .GinkgoWriter .Printf ("Failed to start infra EC2 instances: %v\n " , err )
335
+ return
336
+ }
337
+ err = ec2 .NewInstanceRunningWaiter (ec2Client ).Wait (ctx , & ec2.DescribeInstancesInput {
338
+ InstanceIds : instanceIDs ,
339
+ }, 10 * time .Minute )
340
+ if err != nil {
341
+ ginkgo .GinkgoWriter .Printf ("Infra EC2 instances did not start in time: %v\n " , err )
342
+ return
343
+ }
344
+ ginkgo .GinkgoWriter .Println ("Infra nodes successfully restarted" )
345
+ }()
346
+
347
+ ginkgo .GinkgoWriter .Println ("Stopping infra nodes..." )
348
+ _ , err = ec2Client .StopInstances (ctx , & ec2.StopInstancesInput {
349
+ InstanceIds : instanceIDs ,
350
+ })
351
+ Expect (err ).NotTo (HaveOccurred (), "Failed to stop infra EC2 instances" )
352
+ err = ec2 .NewInstanceStoppedWaiter (ec2Client ).Wait (ctx , & ec2.DescribeInstancesInput {
353
+ InstanceIds : instanceIDs ,
354
+ }, 6 * time .Minute )
355
+ Expect (err ).NotTo (HaveOccurred (), "Infra EC2 instances did not stop in time" )
356
+ ginkgo .GinkgoWriter .Println ("Infra nodes successfully stopped" )
357
+
358
+ _ , err = testPdClient .TriggerIncident ("ClusterHasGoneMissing" , clusterID )
359
+ Expect (err ).NotTo (HaveOccurred (), "Failed to trigger silent PagerDuty alert" )
360
+
361
+ ginkgo .GinkgoWriter .Println ("Sleeping for 2 minutes before checking limited support reasons..." )
362
+ time .Sleep (2 * time .Minute )
363
+
364
+ lsResponseAfter , err := utils .GetLimitedSupportReasons (ocme2eCli , clusterID )
365
+ Expect (err ).NotTo (HaveOccurred (), "Failed to get limited support reasons" )
366
+
367
+ // Print the response data
368
+ fmt .Println ("Limited Support Response After Stopping Infra Nodes:" )
369
+ fmt .Printf ("Total items: %d\n " , lsResponseAfter .Items ().Len ())
370
+
371
+ // Iterate through each item and print details
372
+ items := lsResponseAfter .Items ().Slice ()
373
+ for i , item := range items {
374
+ fmt .Printf ("Reason #%d:\n " , i + 1 )
375
+ fmt .Printf (" - Summary: %s\n " , item .Summary ())
376
+ fmt .Printf (" - Details: %s\n " , item .Details ())
377
+ }
378
+
379
+ Expect (lsResponseAfter .Items ().Len ()).To (BeNumerically (">" , lsReasonsBefore ),
380
+ "Expected more limited support reasons after infrastructure node shutdown" )
381
+ })
382
+ }, ginkgo .ContinueOnFailure )
0 commit comments