From 6a129282bae5c69dfb0d1fceba08f5bec4e4b995 Mon Sep 17 00:00:00 2001
From: Vela Wu
Date: Tue, 7 Jan 2025 18:25:00 +0000
Subject: [PATCH] fix the security group issue

---
 .../nvidia-inference/bert_inference_test.go   |   2 +-
 .../nvidia-training/bert_training_test.go     |   2 +-
 kubetest2/internal/deployers/eksapi/node.go   | 139 ++++++++++++++
 .../templates/unmanaged-nodegroup-efa.yaml    | 174 ++++++++++--------
 4 files changed, 242 insertions(+), 75 deletions(-)

diff --git a/e2e2/test/cases/nvidia-inference/bert_inference_test.go b/e2e2/test/cases/nvidia-inference/bert_inference_test.go
index 9a6396510..35fc021e1 100644
--- a/e2e2/test/cases/nvidia-inference/bert_inference_test.go
+++ b/e2e2/test/cases/nvidia-inference/bert_inference_test.go
@@ -71,7 +71,7 @@ func TestBertInference(t *testing.T) {
 			}
 			err := wait.For(
 				fwext.NewConditionExtension(cfg.Client().Resources()).JobSucceeded(job),
-				wait.WithTimeout(20*time.Minute),
+				wait.WithTimeout(60*time.Minute),
 			)
 			if err != nil {
 				t.Fatalf("[ERROR] BERT inference job did not succeed: %v", err)
diff --git a/e2e2/test/cases/nvidia-training/bert_training_test.go b/e2e2/test/cases/nvidia-training/bert_training_test.go
index 1c1566155..f25bc8a9c 100644
--- a/e2e2/test/cases/nvidia-training/bert_training_test.go
+++ b/e2e2/test/cases/nvidia-training/bert_training_test.go
@@ -67,7 +67,7 @@ func TestBertTraining(t *testing.T) {
 				ObjectMeta: metav1.ObjectMeta{Name: "bert-training-launcher", Namespace: "default"},
 			}
 			err := wait.For(fwext.NewConditionExtension(cfg.Client().Resources()).JobSucceeded(job),
-				wait.WithTimeout(time.Minute*20))
+				wait.WithTimeout(time.Minute*60))
 			if err != nil {
 				t.Fatal(err)
 			}
diff --git a/kubetest2/internal/deployers/eksapi/node.go b/kubetest2/internal/deployers/eksapi/node.go
index 556d84e58..5b1762efc 100644
--- a/kubetest2/internal/deployers/eksapi/node.go
+++ b/kubetest2/internal/deployers/eksapi/node.go
@@ -616,6 +616,20 @@ func (m *nodeManager) deleteUnmanagedNodegroup() error {
 		}
 		return fmt.Errorf("failed to delete unmanaged nodegroup stack: %w", err)
 	}
+
+	efaSecurityGroupID, err := m.getEFASecurityGroupIDFromStack(stackName)
+	if err != nil {
+		return fmt.Errorf("failed to get EFASecurityGroup ID from stack: %w", err)
+	}
+
+	if efaSecurityGroupID != "" {
+		klog.Infof("clean up leakage ENIs in EFA Security Group")
+		err = m.cleanupLeakageENIs(efaSecurityGroupID)
+		if err != nil {
+			return fmt.Errorf("failed to clean up leaked ENIs in EFA security group: %w", err)
+		}
+	}
+
 	klog.Infof("waiting for unmanaged nodegroup stack to be deleted: %s", stackName)
 	err = cloudformation.NewStackDeleteCompleteWaiter(m.clients.CFN()).
 		Wait(context.TODO(),
@@ -630,6 +644,131 @@ func (m *nodeManager) deleteUnmanagedNodegroup() error {
 	return nil
 }
 
+// cleanupLeakageENIs waits for the nodegroup's ASG to be gone and then deletes
+// the network interfaces still attached to the EFA security group.
+// NOTE(review): assumes the ASG is named after m.resourceID; confirm against
+// the nodegroup template.
+func (m *nodeManager) cleanupLeakageENIs(efaSecurityGroupID string) error {
+	klog.Infof("waiting for ASG in stack to be deleted: %s", m.resourceID)
+	if err := m.waitForASGDeletion(m.resourceID); err != nil {
+		return fmt.Errorf("failed to wait for ASG deletion: %w", err)
+	}
+
+	klog.Infof("cleaning up ENIs attached to EFASecurityGroup: %s", efaSecurityGroupID)
+	if err := m.cleanupEFASecurityGroupENIs(efaSecurityGroupID); err != nil {
+		return fmt.Errorf("failed to clean up EFASecurityGroup ENIs: %w", err)
+	}
+	return nil
+}
+
+// waitForASGDeletion checks immediately and then every 10 seconds, for up to
+// 20 minutes, until isASGDeleted reports that asgName no longer exists.
+func (m *nodeManager) waitForASGDeletion(asgName string) error {
+	ctx, cancel := context.WithTimeout(context.Background(), 20*time.Minute)
+	defer cancel()
+	ticker := time.NewTicker(10 * time.Second)
+	defer ticker.Stop()
+	for {
+		// Check before waiting so an already-deleted ASG returns without delay.
+		deleted, err := m.isASGDeleted(asgName)
+		if err != nil {
+			return fmt.Errorf("failed to check ASG deletion: %w", err)
+		}
+		if deleted {
+			return nil
+		}
+		select {
+		case <-ctx.Done():
+			return fmt.Errorf("timed out waiting for ASG %s deletion: %w", asgName, ctx.Err())
+		case <-ticker.C:
+		}
+	}
+}
+
+// isASGDeleted reports whether DescribeAutoScalingGroups returns no group named
+// asgName.
+func (m *nodeManager) isASGDeleted(asgName string) (bool, error) {
+	asgOutput, err := m.clients.ASG().DescribeAutoScalingGroups(context.TODO(), &autoscaling.DescribeAutoScalingGroupsInput{
+		AutoScalingGroupNames: []string{asgName},
+	})
+	if err != nil {
+		return false, fmt.Errorf("failed to describe ASG: %w", err)
+	}
+	return len(asgOutput.AutoScalingGroups) == 0, nil
+}
+
+// cleanupEFASecurityGroupENIs deletes every network interface associated with
+// the given security group, stopping at the first deletion that fails.
+func (m *nodeManager) cleanupEFASecurityGroupENIs(efaSecurityGroupID string) error {
+	enis, err := m.getSecurityGroupNetworkInterfaceIds(efaSecurityGroupID)
+	if err != nil {
+		return fmt.Errorf("failed to describe ENIs: %w", err)
+	}
+
+	for _, eni := range enis {
+		klog.Infof("deleting leaked ENI: %s", eni)
+		_, err := m.clients.EC2().DeleteNetworkInterface(context.TODO(), &ec2.DeleteNetworkInterfaceInput{
+			NetworkInterfaceId: aws.String(eni),
+		})
+		if err != nil {
+			return fmt.Errorf("failed to delete leaked ENI %s: %w", eni, err)
+		}
+	}
+	klog.Infof("deleted %d leaked ENI(s) attached to EFA security group!", len(enis))
+	return nil
+}
+
+// getSecurityGroupNetworkInterfaceIds returns the IDs of the network interfaces
+// associated with the given security group. It follows NextToken so interfaces
+// on later result pages are not missed.
+func (m *nodeManager) getSecurityGroupNetworkInterfaceIds(efaSecurityGroupID string) ([]string, error) {
+	input := &ec2.DescribeNetworkInterfacesInput{
+		Filters: []ec2types.Filter{
+			{
+				Name:   aws.String("group-id"),
+				Values: []string{efaSecurityGroupID},
+			},
+		},
+	}
+
+	var enis []string
+	for {
+		output, err := m.clients.EC2().DescribeNetworkInterfaces(context.TODO(), input)
+		if err != nil {
+			return nil, fmt.Errorf("failed to describe ENIs: %w", err)
+		}
+		for _, eni := range output.NetworkInterfaces {
+			// Skip entries with no ID instead of dereferencing a nil pointer.
+			if id := aws.ToString(eni.NetworkInterfaceId); id != "" {
+				enis = append(enis, id)
+			}
+		}
+		if aws.ToString(output.NextToken) == "" {
+			return enis, nil
+		}
+		input.NextToken = output.NextToken
+	}
+}
+
+// getEFASecurityGroupIDFromStack returns the physical ID of the stack resource
+// whose logical ID is "EFASecurityGroup", or "" when the stack has none.
+func (m *nodeManager) getEFASecurityGroupIDFromStack(stackName string) (string, error) {
+	describeInput := cloudformation.DescribeStackResourcesInput{
+		StackName: aws.String(stackName),
+	}
+	output, err := m.clients.CFN().DescribeStackResources(context.TODO(), &describeInput)
+	if err != nil {
+		return "", fmt.Errorf("failed to describe stack resources: %w", err)
+	}
+	for _, resource := range output.StackResources {
+		// aws.ToString maps nil to "", so a missing ID cannot cause a nil dereference.
+		if aws.ToString(resource.LogicalResourceId) == "EFASecurityGroup" {
+			return aws.ToString(resource.PhysicalResourceId), nil
+		}
+	}
+	return "", nil
+}
+
func (m *nodeManager) getUnmanagedNodegroupStackName() string { return fmt.Sprintf("%s-unmanaged-nodegroup", m.resourceID) } diff --git a/kubetest2/internal/deployers/eksapi/templates/unmanaged-nodegroup-efa.yaml b/kubetest2/internal/deployers/eksapi/templates/unmanaged-nodegroup-efa.yaml index 220e3a155..e0ff19d33 100644 --- a/kubetest2/internal/deployers/eksapi/templates/unmanaged-nodegroup-efa.yaml +++ b/kubetest2/internal/deployers/eksapi/templates/unmanaged-nodegroup-efa.yaml @@ -64,71 +64,76 @@ Conditions: IsUserDataMIMEPart: !Equals [true, !Ref UserDataIsMIMEPart] Resources: + EFASecurityGroup: + Type: "AWS::EC2::SecurityGroup" + Properties: + GroupDescription: Security group for all nodes in the cluster + Tags: + - Key: !Sub "kubernetes.io/cluster/${ClusterName}" + Value: owned + VpcId: !Ref VpcId + EFASecurityGroupIngress: Type: "AWS::EC2::SecurityGroupIngress" + DependsOn: EFASecurityGroup Properties: Description: Allow node to communicate with each other FromPort: 0 ToPort: 65535 - GroupId: !Ref SecurityGroup + GroupId: !Ref EFASecurityGroup IpProtocol: "-1" - SourceSecurityGroupId: !Ref SecurityGroup - - EFASecurityGroupIngressControlPlane: - Type: "AWS::EC2::SecurityGroupIngress" - Properties: - Description: Allow pods to communicate with the cluster API Server - FromPort: 443 - ToPort: 443 - GroupId: !Ref SecurityGroup - IpProtocol: tcp - SourceSecurityGroupId: !Ref SecurityGroup - - EFASecurityGroupFromControlPlaneIngress: - Type: "AWS::EC2::SecurityGroupIngress" - Properties: - Description: Allow worker Kubelets and pods to receive communication from the cluster control plane - FromPort: 1025 - ToPort: 65535 - GroupId: !Ref SecurityGroup - IpProtocol: tcp - SourceSecurityGroupId: !Ref SecurityGroup + SourceSecurityGroupId: !Ref EFASecurityGroup EFASecurityGroupEgress: Type: "AWS::EC2::SecurityGroupEgress" + DependsOn: EFASecurityGroup Properties: Description: Allow the efa worker nodes outbound communication - DestinationSecurityGroupId: !Ref 
SecurityGroup + DestinationSecurityGroupId: !Ref EFASecurityGroup FromPort: 0 ToPort: 65536 - GroupId: !Ref SecurityGroup + GroupId: !Ref EFASecurityGroup IpProtocol: "-1" - + EFASecurityGroupEgressAllIpv4: Type: "AWS::EC2::SecurityGroupEgress" + DependsOn: EFASecurityGroup Properties: Description: Allow the efa worker nodes outbound communication FromPort: 0 ToPort: 65536 CidrIp: "0.0.0.0/0" - GroupId: !Ref SecurityGroup + GroupId: !Ref EFASecurityGroup IpProtocol: "-1" EFASecurityGroupEgressAllIpv6: Type: "AWS::EC2::SecurityGroupEgress" + DependsOn: EFASecurityGroup Properties: Description: Allow the efa worker nodes outbound communication FromPort: 0 ToPort: 65536 CidrIpv6: "::/0" - GroupId: !Ref SecurityGroup + GroupId: !Ref EFASecurityGroup IpProtocol: "-1" + EFASecurityGroupIngressControlPlane: + Type: "AWS::EC2::SecurityGroupIngress" + DependsOn: EFASecurityGroup + Properties: + Description: Allow pods to communicate with the cluster API Server + FromPort: 443 + ToPort: 443 + GroupId: !Ref SecurityGroup + IpProtocol: tcp + SourceSecurityGroupId: !Ref EFASecurityGroup + EFASecurityGroupEgressControlPlane: Type: "AWS::EC2::SecurityGroupEgress" + DependsOn: EFASecurityGroup Properties: Description: Allow the cluster control plane to communicate with worker Kubelet and pods - DestinationSecurityGroupId: !Ref SecurityGroup + DestinationSecurityGroupId: !Ref EFASecurityGroup FromPort: 1025 ToPort: 65535 GroupId: !Ref SecurityGroup @@ -136,14 +141,37 @@ Resources: ControlPlaneEgressToEFASecurityGroupOn443: Type: "AWS::EC2::SecurityGroupEgress" + DependsOn: EFASecurityGroup Properties: Description: Allow the cluster control plane to communicate with pods running extension API servers on port 443 - DestinationSecurityGroupId: !Ref SecurityGroup + DestinationSecurityGroupId: !Ref EFASecurityGroup FromPort: 443 ToPort: 443 GroupId: !Ref SecurityGroup IpProtocol: tcp + EFASecurityGroupFromControlPlaneIngress: + Type: "AWS::EC2::SecurityGroupIngress" + DependsOn: 
EFASecurityGroup + Properties: + Description: Allow worker Kubelets and pods to receive communication from the cluster control plane + FromPort: 1025 + ToPort: 65535 + GroupId: !Ref EFASecurityGroup + IpProtocol: tcp + SourceSecurityGroupId: !Ref SecurityGroup + + EFASecurityGroupFromControlPlaneOn443Ingress: + Type: "AWS::EC2::SecurityGroupIngress" + DependsOn: EFASecurityGroup + Properties: + Description: Allow pods running extension API servers on port 443 to receive communication from cluster control plane + FromPort: 443 + ToPort: 443 + GroupId: !Ref EFASecurityGroup + IpProtocol: tcp + SourceSecurityGroupId: !Ref SecurityGroup + NodeInstanceProfile: Type: AWS::IAM::InstanceProfile Properties: @@ -181,224 +209,224 @@ Resources: DeviceIndex: 0 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 1 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 2 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 3 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 4 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 5 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 6 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref 
SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 7 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 8 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 9 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 10 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 11 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 12 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 13 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 14 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 15 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 16 DeviceIndex: 1 InterfaceType: efa Groups: - 
- !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 17 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 18 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 19 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 20 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 21 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 22 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 23 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 24 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 25 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 26 DeviceIndex: 1 InterfaceType: efa 
Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 27 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 28 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 29 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 30 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 31 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Fn::If: - IsP4Node @@ -408,28 +436,28 @@ Resources: DeviceIndex: 0 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 1 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 2 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 3 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Fn::If: - IsTRN1Node @@ -439,56 +467,56 @@ Resources: DeviceIndex: 0 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref 
EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 1 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 2 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 3 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 4 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 5 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 6 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - Description: NetworkInterfaces Configuration For EFA and EKS NetworkCardIndex: 7 DeviceIndex: 1 InterfaceType: efa Groups: - - !Ref SecurityGroup + - !Ref EFASecurityGroup DeleteOnTermination: true - [] UserData: