From a4d54d8cfdada50c7c8b43ae16f5b1e6f2b46b2c Mon Sep 17 00:00:00 2001 From: Sairatnam Trinagari Date: Mon, 12 May 2025 19:14:20 +0530 Subject: [PATCH 1/9] Final changes --- .../configuration_anomaly_detection_test.go | 105 +++++++++++++++++- 1 file changed, 103 insertions(+), 2 deletions(-) diff --git a/test/e2e/configuration_anomaly_detection_test.go b/test/e2e/configuration_anomaly_detection_test.go index a91fed72..85faba85 100644 --- a/test/e2e/configuration_anomaly_detection_test.go +++ b/test/e2e/configuration_anomaly_detection_test.go @@ -7,6 +7,7 @@ import ( "context" "fmt" "os" + "strings" "time" "github.com/aws/aws-sdk-go-v2/config" @@ -20,6 +21,7 @@ import ( ocme2e "github.com/openshift/osde2e-common/pkg/clients/ocm" "github.com/openshift/osde2e-common/pkg/clients/openshift" appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" "k8s.io/client-go/util/retry" logger "sigs.k8s.io/controller-runtime/pkg/log" ) @@ -115,7 +117,7 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() { Expect(BlockEgress(ctx, ec2Wrapper, sgID)).To(Succeed(), "Failed to block egress") ginkgo.GinkgoWriter.Printf("Egress blocked\n") - time.Sleep(20 * time.Minute) + time.Sleep(1 * time.Minute) lsResponseAfter, err := GetLimitedSupportReasons(ocme2eCli, clusterID) Expect(err).NotTo(HaveOccurred(), "Failed to get limited support reasons") @@ -205,7 +207,7 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() { Expect(err).ToNot(HaveOccurred(), "failed to scale down alertmanager") fmt.Printf("Alertmanager scaled down from %d to 0 replicas. 
Waiting...\n", originalAMReplicas) - time.Sleep(20 * time.Minute) + time.Sleep(1 * time.Minute) logs, err = GetServiceLogs(ocmCli, cluster) Expect(err).ToNot(HaveOccurred(), "Failed to get service logs") @@ -266,4 +268,103 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() { fmt.Println("Test completed: All components restored to original replica counts.") } }) + + It("AWS CCS: can shutdown and restart infrastructure nodes", Label("aws", "ccs", "infra-nodes", "service-logs"), func(ctx context.Context) { + if provider != "aws" { + Skip("This test only runs on AWS clusters") + } + + awsAccessKey := os.Getenv("AWS_ACCESS_KEY_ID") + awsSecretKey := os.Getenv("AWS_SECRET_ACCESS_KEY") + Expect(awsAccessKey).NotTo(BeEmpty(), "AWS access key not found") + Expect(awsSecretKey).NotTo(BeEmpty(), "AWS secret key not found") + + awsCfg, err := config.LoadDefaultConfig(ctx, + config.WithRegion(region), + config.WithCredentialsProvider(credentials.NewStaticCredentialsProvider( + awsAccessKey, + awsSecretKey, + "", + )), + ) + Expect(err).NotTo(HaveOccurred(), "Failed to create AWS config") + + ec2Client := ec2.NewFromConfig(awsCfg) + + // Step 1: Get cluster object + clusterResp, err := ocme2eCli.ClustersMgmt().V1().Clusters().Cluster(clusterID).Get().Send() + Expect(err).ToNot(HaveOccurred(), "Failed to fetch cluster from OCM") + cluster := clusterResp.Body() + + // Step 2: Get service logs before shutdown + serviceLogsBefore, err := GetServiceLogs(ocmCli, cluster) + Expect(err).ToNot(HaveOccurred(), "Failed to get service logs before shutdown") + beforeLogIDs := map[string]bool{} + for _, log := range serviceLogsBefore.Items().Slice() { + beforeLogIDs[log.ID()] = true + } + + // Step 3: Get infra node EC2 instance IDs + var nodeList corev1.NodeList + err = k8s.List(ctx, &nodeList) + Expect(err).NotTo(HaveOccurred(), "Failed to list nodes") + + var instanceIDs []string + for _, node := range nodeList.Items { + if _, isInfra := 
node.Labels["node-role.kubernetes.io/infra"]; !isInfra { + continue + } + providerID := node.Spec.ProviderID + Expect(providerID).ToNot(BeEmpty(), "Infra node missing providerID") + parts := strings.Split(providerID, "/") + instanceIDs = append(instanceIDs, parts[len(parts)-1]) + } + Expect(instanceIDs).NotTo(BeEmpty(), "No infrastructure EC2 instance IDs found") + ginkgo.GinkgoWriter.Printf("Infra EC2 instance IDs: %v\n", instanceIDs) + + // Step 4: Stop EC2 instances + _, err = ec2Client.StopInstances(ctx, &ec2.StopInstancesInput{ + InstanceIds: instanceIDs, + }) + Expect(err).NotTo(HaveOccurred(), "Failed to stop infra EC2 instances") + + err = ec2.NewInstanceStoppedWaiter(ec2Client).Wait(ctx, &ec2.DescribeInstancesInput{ + InstanceIds: instanceIDs, + }, 5*time.Minute) + Expect(err).NotTo(HaveOccurred(), "Infra EC2 instances did not stop in time") + ginkgo.GinkgoWriter.Println("Infra nodes successfully stopped") + + // Step 5: Wait 20 minutes + ginkgo.GinkgoWriter.Println("Sleeping for 20 minutes before restarting nodes...") + time.Sleep(2 * time.Minute) + + // Step 6: Get service logs after shutdown + serviceLogsAfter, err := GetServiceLogs(ocmCli, cluster) + Expect(err).ToNot(HaveOccurred(), "Failed to get service logs after shutdown") + + fmt.Println("New service logs generated during infra node downtime:") + newLogsFound := false + for _, log := range serviceLogsAfter.Items().Slice() { + if !beforeLogIDs[log.ID()] { + newLogsFound = true + fmt.Printf("ID: %s\nSummary: %s\nDescription: %s\n\n", log.ID(), log.Summary(), log.Description()) + } + } + if !newLogsFound { + fmt.Println("No new service logs found.") + } + + // Step 7: Start EC2 instances again + _, err = ec2Client.StartInstances(ctx, &ec2.StartInstancesInput{ + InstanceIds: instanceIDs, + }) + Expect(err).NotTo(HaveOccurred(), "Failed to start infra EC2 instances") + + err = ec2.NewInstanceRunningWaiter(ec2Client).Wait(ctx, &ec2.DescribeInstancesInput{ + InstanceIds: instanceIDs, + }, 
10*time.Minute) + Expect(err).NotTo(HaveOccurred(), "Infra EC2 instances did not start in time") + + ginkgo.GinkgoWriter.Println("Infra nodes successfully restarted") + }) }) From d3983f7d0f65cd7f93d3d9db9da956fc5d48c48c Mon Sep 17 00:00:00 2001 From: Sairatnam Trinagari Date: Mon, 12 May 2025 19:18:18 +0530 Subject: [PATCH 2/9] Final changes --- test/e2e/configuration_anomaly_detection_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/e2e/configuration_anomaly_detection_test.go b/test/e2e/configuration_anomaly_detection_test.go index 85faba85..700e082c 100644 --- a/test/e2e/configuration_anomaly_detection_test.go +++ b/test/e2e/configuration_anomaly_detection_test.go @@ -117,7 +117,7 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() { Expect(BlockEgress(ctx, ec2Wrapper, sgID)).To(Succeed(), "Failed to block egress") ginkgo.GinkgoWriter.Printf("Egress blocked\n") - time.Sleep(1 * time.Minute) + time.Sleep(20 * time.Minute) lsResponseAfter, err := GetLimitedSupportReasons(ocme2eCli, clusterID) Expect(err).NotTo(HaveOccurred(), "Failed to get limited support reasons") @@ -207,7 +207,7 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() { Expect(err).ToNot(HaveOccurred(), "failed to scale down alertmanager") fmt.Printf("Alertmanager scaled down from %d to 0 replicas. 
Waiting...\n", originalAMReplicas) - time.Sleep(1 * time.Minute) + time.Sleep(20 * time.Minute) logs, err = GetServiceLogs(ocmCli, cluster) Expect(err).ToNot(HaveOccurred(), "Failed to get service logs") @@ -336,7 +336,7 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() { // Step 5: Wait 20 minutes ginkgo.GinkgoWriter.Println("Sleeping for 20 minutes before restarting nodes...") - time.Sleep(2 * time.Minute) + time.Sleep(20 * time.Minute) // Step 6: Get service logs after shutdown serviceLogsAfter, err := GetServiceLogs(ocmCli, cluster) Expect(err).ToNot(HaveOccurred(), "Failed to get service logs after shutdown") From b56b54240a1c2082d88b11a684c3a60031b6baa0 Mon Sep 17 00:00:00 2001 From: Sairatnam Trinagari Date: Fri, 16 May 2025 11:15:44 +0530 Subject: [PATCH 3/9] Made changes as per the comments received --- .../configuration_anomaly_detection_test.go | 38 ++++++++----------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/test/e2e/configuration_anomaly_detection_test.go b/test/e2e/configuration_anomaly_detection_test.go index 700e082c..760103af 100644 --- a/test/e2e/configuration_anomaly_detection_test.go +++ b/test/e2e/configuration_anomaly_detection_test.go @@ -43,7 +43,7 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() { ocmToken := os.Getenv("OCM_TOKEN") clientID := os.Getenv("CLIENT_ID") clientSecret := os.Getenv("CLIENT_SECRET") - clusterID = os.Getenv("CLUSTER_ID") + clusterID = os.Getenv("OCM_CLUSTER_ID") cadOcmFilePath := os.Getenv("CAD_OCM_FILE_PATH") Expect(ocmToken).NotTo(BeEmpty(), "OCM_TOKEN must be set") @@ -273,12 +273,10 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() { if provider != "aws" { Skip("This test only runs on AWS clusters") } - awsAccessKey := os.Getenv("AWS_ACCESS_KEY_ID") awsSecretKey := os.Getenv("AWS_SECRET_ACCESS_KEY") Expect(awsAccessKey).NotTo(BeEmpty(), "AWS access key not found") Expect(awsSecretKey).NotTo(BeEmpty(), "AWS secret key not found") - awsCfg, err := config.LoadDefaultConfig(ctx, config.WithRegion(region), 
config.WithCredentialsProvider(credentials.NewStaticCredentialsProvider( @@ -288,7 +286,6 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() { )), ) Expect(err).NotTo(HaveOccurred(), "Failed to create AWS config") - ec2Client := ec2.NewFromConfig(awsCfg) // Step 1: Get cluster object @@ -308,7 +305,6 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() { var nodeList corev1.NodeList err = k8s.List(ctx, &nodeList) Expect(err).NotTo(HaveOccurred(), "Failed to list nodes") - var instanceIDs []string for _, node := range nodeList.Items { if _, isInfra := node.Labels["node-role.kubernetes.io/infra"]; !isInfra { @@ -327,7 +323,6 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() { InstanceIds: instanceIDs, }) Expect(err).NotTo(HaveOccurred(), "Failed to stop infra EC2 instances") - err = ec2.NewInstanceStoppedWaiter(ec2Client).Wait(ctx, &ec2.DescribeInstancesInput{ InstanceIds: instanceIDs, }, 5*time.Minute) @@ -338,6 +333,20 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() { ginkgo.GinkgoWriter.Println("Sleeping for 20 minutes before restarting nodes...") time.Sleep(20 * time.Minute) + // Setup deferred EC2 restart to ensure it happens regardless of test outcome + defer func() { + ginkgo.GinkgoWriter.Println("Restarting infra nodes regardless of test status...") + _, err := ec2Client.StartInstances(ctx, &ec2.StartInstancesInput{ + InstanceIds: instanceIDs, + }) + Expect(err).NotTo(HaveOccurred(), "Failed to start infra EC2 instances") + err = ec2.NewInstanceRunningWaiter(ec2Client).Wait(ctx, &ec2.DescribeInstancesInput{ + InstanceIds: instanceIDs, + }, 10*time.Minute) + Expect(err).NotTo(HaveOccurred(), "Infra EC2 instances did not start in time") + ginkgo.GinkgoWriter.Println("Infra nodes successfully restarted") + }() + // Step 6: Get service logs after shutdown serviceLogsAfter, err := GetServiceLogs(ocmCli, cluster) Expect(err).ToNot(HaveOccurred(), "Failed to get service logs after 
shutdown") @@ -350,21 +359,6 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() { fmt.Printf("ID: %s\nSummary: %s\nDescription: %s\n\n", log.ID(), log.Summary(), log.Description()) } } - if !newLogsFound { - fmt.Println("No new service logs found.") - } - - // Step 7: Start EC2 instances again - _, err = ec2Client.StartInstances(ctx, &ec2.StartInstancesInput{ - InstanceIds: instanceIDs, - }) - Expect(err).NotTo(HaveOccurred(), "Failed to start infra EC2 instances") - - err = ec2.NewInstanceRunningWaiter(ec2Client).Wait(ctx, &ec2.DescribeInstancesInput{ - InstanceIds: instanceIDs, - }, 10*time.Minute) - Expect(err).NotTo(HaveOccurred(), "Infra EC2 instances did not start in time") - - ginkgo.GinkgoWriter.Println("Infra nodes successfully restarted") + Expect(newLogsFound).To(BeTrue(), "No new service logs were found after infrastructure node shutdown") }) }) From 4ea9c586990a126041ff1cebd8f2ad8a00931fb0 Mon Sep 17 00:00:00 2001 From: Sairatnam Trinagari Date: Fri, 16 May 2025 18:29:12 +0530 Subject: [PATCH 4/9] Made changes as per the commits --- .../configuration_anomaly_detection_test.go | 60 ++++++++++++------- 1 file changed, 39 insertions(+), 21 deletions(-) diff --git a/test/e2e/configuration_anomaly_detection_test.go b/test/e2e/configuration_anomaly_detection_test.go index 760103af..ce77601b 100644 --- a/test/e2e/configuration_anomaly_detection_test.go +++ b/test/e2e/configuration_anomaly_detection_test.go @@ -294,8 +294,11 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() { cluster := clusterResp.Body() // Step 2: Get service logs before shutdown + ginkgo.GinkgoWriter.Println("Getting service logs before infra node shutdown...") serviceLogsBefore, err := GetServiceLogs(ocmCli, cluster) Expect(err).ToNot(HaveOccurred(), "Failed to get service logs before shutdown") + + // Create a map of existing log IDs for quick lookup beforeLogIDs := map[string]bool{} for _, log := range serviceLogsBefore.Items().Slice() { 
beforeLogIDs[log.ID()] = true @@ -316,22 +319,7 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() { instanceIDs = append(instanceIDs, parts[len(parts)-1]) } Expect(instanceIDs).NotTo(BeEmpty(), "No infrastructure EC2 instance IDs found") - ginkgo.GinkgoWriter.Printf("Infra EC2 instance IDs: %v\n", instanceIDs) - - // Step 4: Stop EC2 instances - _, err = ec2Client.StopInstances(ctx, &ec2.StopInstancesInput{ - InstanceIds: instanceIDs, - }) - Expect(err).NotTo(HaveOccurred(), "Failed to stop infra EC2 instances") - err = ec2.NewInstanceStoppedWaiter(ec2Client).Wait(ctx, &ec2.DescribeInstancesInput{ - InstanceIds: instanceIDs, - }, 5*time.Minute) - Expect(err).NotTo(HaveOccurred(), "Infra EC2 instances did not stop in time") - ginkgo.GinkgoWriter.Println("Infra nodes successfully stopped") - - // Step 5: Wait 20 minutes - ginkgo.GinkgoWriter.Println("Sleeping for 20 minutes before restarting nodes...") - time.Sleep(20 * time.Minute) + ginkgo.GinkgoWriter.Printf("Found %d infra node(s) with EC2 instance IDs: %v\n", len(instanceIDs), instanceIDs) // Setup deferred EC2 restart to ensure it happens regardless of test outcome defer func() { @@ -347,18 +335,48 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() { ginkgo.GinkgoWriter.Println("Infra nodes successfully restarted") }() + // Step 4: Stop EC2 instances + ginkgo.GinkgoWriter.Println("Stopping infra nodes...") + _, err = ec2Client.StopInstances(ctx, &ec2.StopInstancesInput{ + InstanceIds: instanceIDs, + }) + Expect(err).NotTo(HaveOccurred(), "Failed to stop infra EC2 instances") + err = ec2.NewInstanceStoppedWaiter(ec2Client).Wait(ctx, &ec2.DescribeInstancesInput{ + InstanceIds: instanceIDs, + }, 5*time.Minute) + Expect(err).NotTo(HaveOccurred(), "Infra EC2 instances did not stop in time") + ginkgo.GinkgoWriter.Println("Infra nodes successfully stopped") + + // Step 5: Wait 20 minutes + ginkgo.GinkgoWriter.Println("Sleeping for 20 minutes before checking logs...") + 
time.Sleep(20 * time.Minute) + // Step 6: Get service logs after shutdown + ginkgo.GinkgoWriter.Println("Getting service logs after infra node shutdown...") serviceLogsAfter, err := GetServiceLogs(ocmCli, cluster) Expect(err).ToNot(HaveOccurred(), "Failed to get service logs after shutdown") - fmt.Println("New service logs generated during infra node downtime:") - newLogsFound := false + var newLogs []interface{} + for _, log := range serviceLogsAfter.Items().Slice() { if !beforeLogIDs[log.ID()] { - newLogsFound = true - fmt.Printf("ID: %s\nSummary: %s\nDescription: %s\n\n", log.ID(), log.Summary(), log.Description()) + newLogs = append(newLogs, log) } } - Expect(newLogsFound).To(BeTrue(), "No new service logs were found after infrastructure node shutdown") + + if len(newLogs) > 0 { + ginkgo.GinkgoWriter.Printf("Found %d new service logs during infra node downtime:\n", len(newLogs)) + for _, logInterface := range newLogs { + log := logInterface.(interface{}) // Type assertion to access methods + ginkgo.GinkgoWriter.Printf("ID: %s\nSummary: %s\nDescription: %s\n\n", + log.(interface{ ID() string }).ID(), + log.(interface{ Summary() string }).Summary(), + log.(interface{ Description() string }).Description()) + } + } else { + ginkgo.GinkgoWriter.Println("No new service logs found after infra node shutdown") + } + + Expect(len(newLogs)).To(BeNumerically(">", 0), "No new service logs were found after infrastructure node shutdown") }) }) From 2e886c20a0b80214569aa46ac0a1a56fb480b78f Mon Sep 17 00:00:00 2001 From: Sairatnam Trinagari Date: Wed, 21 May 2025 21:21:36 +0530 Subject: [PATCH 5/9] Final changes --- .../configuration_anomaly_detection_test.go | 116 ++++++++++-------- test/e2e/generate_incident.go | 0 2 files changed, 67 insertions(+), 49 deletions(-) create mode 100644 test/e2e/generate_incident.go diff --git a/test/e2e/configuration_anomaly_detection_test.go b/test/e2e/configuration_anomaly_detection_test.go index 9778d47b..23707e1c 100644 --- 
a/test/e2e/configuration_anomaly_detection_test.go +++ b/test/e2e/configuration_anomaly_detection_test.go @@ -34,6 +34,7 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() { region string provider string clusterID string + pdClient PagerDutyClient ) BeforeAll(func(ctx context.Context) { @@ -64,6 +65,12 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() { provider, err = k8s.GetProvider(ctx) Expect(err).NotTo(HaveOccurred(), "Could not determine provider") + + pdRoutingKey := os.Getenv("PD_ROUTING_KEY") + pdToken := os.Getenv("PD_AUTH_TOKEN") + Expect(pdRoutingKey).NotTo(BeEmpty(), "PAGERDUTY_ROUTING_KEY must be set") + Expect(pdToken).NotTo(BeEmpty(), "PAGERDUTY_TOKEN must be set") + pdClient = NewClient(pdRoutingKey, pdToken) }) AfterAll(func() { @@ -117,7 +124,10 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() { Expect(BlockEgress(ctx, ec2Wrapper, sgID)).To(Succeed(), "Failed to block egress") ginkgo.GinkgoWriter.Printf("Egress blocked\n") - time.Sleep(20 * time.Minute) + _, err = pdClient.CreateSilentRequest("ClusterHasGoneMissing", clusterID) + Expect(err).NotTo(HaveOccurred(), "Failed to trigger silent PagerDuty alert") + + time.Sleep(5 * time.Minute) lsResponseAfter, err := GetLimitedSupportReasons(ocme2eCli, clusterID) Expect(err).NotTo(HaveOccurred(), "Failed to get limited support reasons") @@ -134,9 +144,19 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() { fmt.Printf(" - Details: %s\n", item.Details()) } - // Restore egress - Expect(RestoreEgress(ctx, ec2Wrapper, sgID)).To(Succeed(), "Failed to restore egress") - ginkgo.GinkgoWriter.Printf("Egress restored\n") + // Clean up: restore egress before checking test conditions + defer func() { + err := RestoreEgress(ctx, ec2Wrapper, sgID) + if err != nil { + ginkgo.GinkgoWriter.Printf("Failed to restore egress: %v\n", err) + } else { + ginkgo.GinkgoWriter.Printf("Egress restored\n") + } + }() + + // Verify test result: Expect 
new limited support reasons to be found after blocking egress + Expect(lsResponseAfter.Items().Len()).To(BeNumerically(">", lsReasonsBefore), + "No new limited support reasons found after blocking egress") } }) @@ -207,7 +227,7 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() { Expect(err).ToNot(HaveOccurred(), "failed to scale down alertmanager") fmt.Printf("Alertmanager scaled down from %d to 0 replicas. Waiting...\n", originalAMReplicas) - time.Sleep(20 * time.Minute) + time.Sleep(1 * time.Minute) logs, err = GetServiceLogs(ocmCli, cluster) Expect(err).ToNot(HaveOccurred(), "Failed to get service logs") @@ -269,7 +289,7 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() { } }) - It("AWS CCS: can shutdown and restart infrastructure nodes", Label("aws", "ccs", "infra-nodes", "service-logs"), func(ctx context.Context) { + It("AWS CCS: can shutdown and restart infrastructure nodes", Label("aws", "ccs", "infra-nodes", "limited-support"), func(ctx context.Context) { if provider != "aws" { Skip("This test only runs on AWS clusters") } @@ -289,20 +309,17 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() { ec2Client := ec2.NewFromConfig(awsCfg) // Step 1: Get cluster object - clusterResp, err := ocme2eCli.ClustersMgmt().V1().Clusters().Cluster(clusterID).Get().Send() - Expect(err).ToNot(HaveOccurred(), "Failed to fetch cluster from OCM") - cluster := clusterResp.Body() - - // Step 2: Get service logs before shutdown - ginkgo.GinkgoWriter.Println("Getting service logs before infra node shutdown...") - serviceLogsBefore, err := GetServiceLogs(ocmCli, cluster) - Expect(err).ToNot(HaveOccurred(), "Failed to get service logs before shutdown") - - // Create a map of existing log IDs for quick lookup - beforeLogIDs := map[string]bool{} - for _, log := range serviceLogsBefore.Items().Slice() { - beforeLogIDs[log.ID()] = true - } + //clusterResp, err := 
ocme2eCli.ClustersMgmt().V1().Clusters().Cluster(clusterID).Get().Send() + //Expect(err).ToNot(HaveOccurred(), "Failed to fetch cluster from OCM") + //cluster := clusterResp.Body() + + // Step 2: Get limited support reasons before shutdown + ginkgo.GinkgoWriter.Println("Getting limited support reasons before infra node shutdown...") + lsResponseBefore, err := GetLimitedSupportReasons(ocme2eCli, clusterID) + Expect(err).NotTo(HaveOccurred(), "Failed to get limited support reasons") + lsReasonsBefore := lsResponseBefore.Items().Len() + + ginkgo.GinkgoWriter.Printf("Limited support reasons before infra node shutdown: %d\n", lsReasonsBefore) // Step 3: Get infra node EC2 instance IDs var nodeList corev1.NodeList @@ -327,11 +344,17 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() { _, err := ec2Client.StartInstances(ctx, &ec2.StartInstancesInput{ InstanceIds: instanceIDs, }) - Expect(err).NotTo(HaveOccurred(), "Failed to start infra EC2 instances") + if err != nil { + ginkgo.GinkgoWriter.Printf("Failed to start infra EC2 instances: %v\n", err) + return + } err = ec2.NewInstanceRunningWaiter(ec2Client).Wait(ctx, &ec2.DescribeInstancesInput{ InstanceIds: instanceIDs, }, 10*time.Minute) - Expect(err).NotTo(HaveOccurred(), "Infra EC2 instances did not start in time") + if err != nil { + ginkgo.GinkgoWriter.Printf("Infra EC2 instances did not start in time: %v\n", err) + return + } ginkgo.GinkgoWriter.Println("Infra nodes successfully restarted") }() @@ -343,40 +366,35 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() { Expect(err).NotTo(HaveOccurred(), "Failed to stop infra EC2 instances") err = ec2.NewInstanceStoppedWaiter(ec2Client).Wait(ctx, &ec2.DescribeInstancesInput{ InstanceIds: instanceIDs, - }, 5*time.Minute) + }, 6*time.Minute) Expect(err).NotTo(HaveOccurred(), "Infra EC2 instances did not stop in time") ginkgo.GinkgoWriter.Println("Infra nodes successfully stopped") - // Step 5: Wait 20 minutes - 
ginkgo.GinkgoWriter.Println("Sleeping for 20 minutes before checking logs...") - time.Sleep(20 * time.Minute) + _, err = pdClient.CreateSilentRequest("ClusterHasGoneMissing", clusterID) + Expect(err).NotTo(HaveOccurred(), "Failed to trigger silent PagerDuty alert") - // Step 6: Get service logs after shutdown - ginkgo.GinkgoWriter.Println("Getting service logs after infra node shutdown...") - serviceLogsAfter, err := GetServiceLogs(ocmCli, cluster) - Expect(err).ToNot(HaveOccurred(), "Failed to get service logs after shutdown") + // Step 5: Wait for some time + ginkgo.GinkgoWriter.Println("Sleeping for 2 minutes before checking limited support reasons...") + time.Sleep(2 * time.Minute) - var newLogs []interface{} + // Step 6: Get limited support reasons after shutdown + lsResponseAfter, err := GetLimitedSupportReasons(ocme2eCli, clusterID) + Expect(err).NotTo(HaveOccurred(), "Failed to get limited support reasons") - for _, log := range serviceLogsAfter.Items().Slice() { - if !beforeLogIDs[log.ID()] { - newLogs = append(newLogs, log) - } - } + // Print the response data + fmt.Println("Limited Support Response After Stopping Infra Nodes:") + fmt.Printf("Total items: %d\n", lsResponseAfter.Items().Len()) - if len(newLogs) > 0 { - ginkgo.GinkgoWriter.Printf("Found %d new service logs during infra node downtime:\n", len(newLogs)) - for _, logInterface := range newLogs { - log := logInterface.(interface{}) // Type assertion to access methods - ginkgo.GinkgoWriter.Printf("ID: %s\nSummary: %s\nDescription: %s\n\n", - log.(interface{ ID() string }).ID(), - log.(interface{ Summary() string }).Summary(), - log.(interface{ Description() string }).Description()) - } - } else { - ginkgo.GinkgoWriter.Println("No new service logs found after infra node shutdown") + // Iterate through each item and print details + items := lsResponseAfter.Items().Slice() + for i, item := range items { + fmt.Printf("Reason #%d:\n", i+1) + fmt.Printf(" - Summary: %s\n", item.Summary()) + 
fmt.Printf(" - Details: %s\n", item.Details()) } - Expect(len(newLogs)).To(BeNumerically(">", 0), "No new service logs were found after infrastructure node shutdown") + // Step 7: Check if limited support reasons changed + Expect(lsResponseAfter.Items().Len()).To(BeNumerically(">", lsReasonsBefore), + "Expected more limited support reasons after infrastructure node shutdown") }) -}) +}, ginkgo.ContinueOnFailure) diff --git a/test/e2e/generate_incident.go b/test/e2e/generate_incident.go new file mode 100644 index 00000000..e69de29b From 545122878c2dbaced8a03f3888d9baba3dc691c8 Mon Sep 17 00:00:00 2001 From: Sairatnam Trinagari Date: Wed, 21 May 2025 21:29:02 +0530 Subject: [PATCH 6/9] Changed the PagerDuty token --- test/e2e/configuration_anomaly_detection_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/e2e/configuration_anomaly_detection_test.go b/test/e2e/configuration_anomaly_detection_test.go index 23707e1c..27f2abe4 100644 --- a/test/e2e/configuration_anomaly_detection_test.go +++ b/test/e2e/configuration_anomaly_detection_test.go @@ -66,8 +66,8 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() { provider, err = k8s.GetProvider(ctx) Expect(err).NotTo(HaveOccurred(), "Could not determine provider") - pdRoutingKey := os.Getenv("PD_ROUTING_KEY") - pdToken := os.Getenv("PD_AUTH_TOKEN") + pdRoutingKey := os.Getenv("CAD_PD_TOKEN") + pdToken := os.Getenv("CAD_PD_TOKEN") Expect(pdRoutingKey).NotTo(BeEmpty(), "PAGERDUTY_ROUTING_KEY must be set") Expect(pdToken).NotTo(BeEmpty(), "PAGERDUTY_TOKEN must be set") pdClient = NewClient(pdRoutingKey, pdToken) From 60759ed808cab91951a23796fb013f1b6b1759f6 Mon Sep 17 00:00:00 2001 From: Sairatnam Trinagari Date: Wed, 21 May 2025 22:07:32 +0530 Subject: [PATCH 7/9] Fixed lint issue --- test/e2e/generate_incident.go | 184 ++++++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) diff --git a/test/e2e/generate_incident.go b/test/e2e/generate_incident.go index 
e69de29b..b9b56db9 100644 --- a/test/e2e/generate_incident.go +++ b/test/e2e/generate_incident.go @@ -0,0 +1,184 @@ +package osde2etests + +import ( + "bytes" + "encoding/json" + "errors" + "fmt" + "io" + "net/http" + "time" + + "github.com/google/uuid" +) + +type PagerDutyClient interface { + CreateSilentRequest(alertName, clusterID string) (string, error) + GetIncidentID(dedupKey string) (string, error) + ResolveIncident(incidentID string) error +} + +type client struct { + eventsURL string + apiURL string + routingKey string + authToken string + alertMappings map[string]string + httpClient *http.Client +} + +func NewClient(routingKey, authToken string) PagerDutyClient { + return &client{ + eventsURL: "https://events.pagerduty.com/v2/enqueue", + apiURL: "https://api.pagerduty.com/incidents", + routingKey: routingKey, + authToken: authToken, + alertMappings: map[string]string{ + "ClusterHasGoneMissing": "cadtest has gone missing", + "ClusterProvisioningDelay": "ClusterProvisioningDelay -", + "ClusterMonitoringErrorBudgetBurnSRE": "ClusterMonitoringErrorBudgetBurnSRE Critical (1)", + "InsightsOperatorDown": "InsightsOperatorDown", + "MachineHealthCheckUnterminatedShortCircuitSRE": "MachineHealthCheckUnterminatedShortCircuitSRE CRITICAL (1)", + "ApiErrorBudgetBurn": "api-ErrorBudgetBurn k8sgpt test CRITICAL (1)", + }, + httpClient: &http.Client{Timeout: 10 * time.Second}, + } +} + +type payload struct { + Payload struct { + Summary string `json:"summary"` + Timestamp string `json:"timestamp"` + Severity string `json:"severity"` + Source string `json:"source"` + Details map[string]string `json:"custom_details"` + } `json:"payload"` + RoutingKey string `json:"routing_key"` + EventAction string `json:"event_action"` + DedupKey string `json:"dedup_key"` +} + +func (c *client) CreateSilentRequest(alertName, clusterID string) (string, error) { + title, ok := c.alertMappings[alertName] + if !ok { + return "", fmt.Errorf("unknown alert name: %s", alertName) + } + + dedupKey 
:= generateUUID() + + now := time.Now().UTC().Format(time.RFC3339) + p := payload{ + RoutingKey: c.routingKey, + EventAction: "trigger", + DedupKey: dedupKey, + } + p.Payload.Summary = title + p.Payload.Timestamp = now + p.Payload.Severity = "critical" + p.Payload.Source = "cad-integration-testing" + p.Payload.Details = map[string]string{ + "alertname": alertName, + "cluster_id": clusterID, + } + + body, err := json.Marshal(p) + if err != nil { + return "", err + } + + req, err := http.NewRequest("POST", c.eventsURL, bytes.NewBuffer(body)) + if err != nil { + return "", err + } + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Accept", "application/json") + + resp, err := c.httpClient.Do(req) + if err != nil { + return "", err + } + defer resp.Body.Close() + respBody, err := io.ReadAll(resp.Body) + if err != nil { + return "", fmt.Errorf("failed to read response body: %v", err) + } + + if resp.StatusCode != http.StatusAccepted && resp.StatusCode != http.StatusOK { + return "", errors.New("failed to trigger alert - RespBody : " + string(respBody)) + } + + // Sleep to give time for the incident to be indexed + time.Sleep(2 * time.Second) + + return dedupKey, nil +} + +func (c *client) GetIncidentID(dedupKey string) (string, error) { + url := fmt.Sprintf("%s?incident_key=%s", c.apiURL, dedupKey) + req, err := http.NewRequest("GET", url, nil) + if err != nil { + return "", err + } + req.Header.Set("Authorization", "Token token="+c.authToken) + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Accept", "application/json") + + resp, err := c.httpClient.Do(req) + if err != nil { + return "", err + } + defer resp.Body.Close() + + var result struct { + Incidents []struct { + ID string `json:"id"` + } `json:"incidents"` + } + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + return "", err + } + + if len(result.Incidents) == 0 { + return "", errors.New("incident not found") + } + return result.Incidents[0].ID, nil +} 
+ +func (c *client) ResolveIncident(incidentID string) error { + url := fmt.Sprintf("%s/%s", c.apiURL, incidentID) + + payload := map[string]interface{}{ + "incident": map[string]string{ + "type": "incident_reference", + "status": "resolved", + }, + } + body, err := json.Marshal(payload) + if err != nil { + return err + } + + req, err := http.NewRequest("PUT", url, bytes.NewBuffer(body)) + if err != nil { + return err + } + req.Header.Set("Authorization", "Token token="+c.authToken) + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Accept", "application/vnd.pagerduty+json;version=2") + + resp, err := c.httpClient.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("failed to resolve incident, status code: %d", resp.StatusCode) + } + + return nil +} + +func generateUUID() string { + return uuid.New().String() +} From 8df9a97875898cd3ccd91107335a22cd89e05d36 Mon Sep 17 00:00:00 2001 From: Sairatnam Trinagari Date: Wed, 21 May 2025 22:53:03 +0530 Subject: [PATCH 8/9] Fixed lint issue --- test/e2e/generate_incident.go | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/test/e2e/generate_incident.go b/test/e2e/generate_incident.go index b9b56db9..c1415281 100644 --- a/test/e2e/generate_incident.go +++ b/test/e2e/generate_incident.go @@ -2,6 +2,7 @@ package osde2etests import ( "bytes" + "context" "encoding/json" "errors" "fmt" @@ -86,7 +87,8 @@ func (c *client) CreateSilentRequest(alertName, clusterID string) (string, error return "", err } - req, err := http.NewRequest("POST", c.eventsURL, bytes.NewBuffer(body)) + ctx := context.Background() + req, err := http.NewRequestWithContext(ctx, "POST", c.eventsURL, bytes.NewBuffer(body)) if err != nil { return "", err } @@ -97,10 +99,14 @@ func (c *client) CreateSilentRequest(alertName, clusterID string) (string, error if err != nil { return "", err } - defer resp.Body.Close() + 
defer func() { + if cerr := resp.Body.Close(); cerr != nil && err == nil { + err = cerr + } + }() respBody, err := io.ReadAll(resp.Body) if err != nil { - return "", fmt.Errorf("failed to read response body: %v", err) + return "", fmt.Errorf("failed to read response body: %w", err) } if resp.StatusCode != http.StatusAccepted && resp.StatusCode != http.StatusOK { @@ -115,7 +121,9 @@ func (c *client) CreateSilentRequest(alertName, clusterID string) (string, error func (c *client) GetIncidentID(dedupKey string) (string, error) { url := fmt.Sprintf("%s?incident_key=%s", c.apiURL, dedupKey) - req, err := http.NewRequest("GET", url, nil) + + ctx := context.Background() + req, err := http.NewRequestWithContext(ctx, "GET", url, nil) if err != nil { return "", err } @@ -127,7 +135,11 @@ func (c *client) GetIncidentID(dedupKey string) (string, error) { if err != nil { return "", err } - defer resp.Body.Close() + defer func() { + if cerr := resp.Body.Close(); cerr != nil && err == nil { + err = cerr + } + }() var result struct { Incidents []struct { @@ -158,7 +170,8 @@ func (c *client) ResolveIncident(incidentID string) error { return err } - req, err := http.NewRequest("PUT", url, bytes.NewBuffer(body)) + ctx := context.Background() + req, err := http.NewRequestWithContext(ctx, "PUT", url, bytes.NewBuffer(body)) if err != nil { return err } @@ -170,7 +183,11 @@ func (c *client) ResolveIncident(incidentID string) error { if err != nil { return err } - defer resp.Body.Close() + defer func() { + if cerr := resp.Body.Close(); cerr != nil && err == nil { + err = cerr + } + }() if resp.StatusCode != http.StatusOK { return fmt.Errorf("failed to resolve incident, status code: %d", resp.StatusCode) From c250a8e0830f1aeb5c11cc78809423a769973cd5 Mon Sep 17 00:00:00 2001 From: Sairatnam Trinagari Date: Thu, 22 May 2025 00:03:00 +0530 Subject: [PATCH 9/9] Fixed lint issue --- test/e2e/generate_incident.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/test/e2e/generate_incident.go b/test/e2e/generate_incident.go index c1415281..ed64d1a9 100644 --- a/test/e2e/generate_incident.go +++ b/test/e2e/generate_incident.go @@ -88,7 +88,7 @@ func (c *client) CreateSilentRequest(alertName, clusterID string) (string, error } ctx := context.Background() - req, err := http.NewRequestWithContext(ctx, "POST", c.eventsURL, bytes.NewBuffer(body)) + req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.eventsURL, bytes.NewBuffer(body)) if err != nil { return "", err } @@ -123,7 +123,7 @@ func (c *client) GetIncidentID(dedupKey string) (string, error) { url := fmt.Sprintf("%s?incident_key=%s", c.apiURL, dedupKey) ctx := context.Background() - req, err := http.NewRequestWithContext(ctx, "GET", url, nil) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) if err != nil { return "", err } @@ -171,7 +171,7 @@ func (c *client) ResolveIncident(incidentID string) error { } ctx := context.Background() - req, err := http.NewRequestWithContext(ctx, "PUT", url, bytes.NewBuffer(body)) + req, err := http.NewRequestWithContext(ctx, http.MethodPut, url, bytes.NewBuffer(body)) if err != nil { return err }