diff --git a/internal/deployers/eksapi/cluster.go b/internal/deployers/eksapi/cluster.go
index d200e70f8..c16efabe3 100644
--- a/internal/deployers/eksapi/cluster.go
+++ b/internal/deployers/eksapi/cluster.go
@@ -13,11 +13,6 @@ import (
 	"k8s.io/klog/v2"
 )
 
-const (
-	clusterCreationTimeout = time.Minute * 15
-	clusterDeletionTimeout = time.Minute * 15
-)
-
 type ClusterManager struct {
 	clients    *awsClients
 	resourceID string
@@ -93,18 +88,18 @@ func (m *ClusterManager) getOrCreateCluster(infra *Infrastructure, opts *deploye
 	} else {
 		klog.Infof("reusing existing static cluster %s", opts.StaticClusterName)
 	}
-	cluster, waitErr := m.waitForClusterActive(targetClusterName)
+	cluster, waitErr := m.waitForClusterActive(targetClusterName, opts.ClusterCreationTimeout)
 	if waitErr != nil {
 		return nil, fmt.Errorf("failed to wait for cluster to become active: %v", waitErr)
 	}
 	return cluster, nil
 }
 
-func (m *ClusterManager) waitForClusterActive(clusterName string) (*Cluster, error) {
+func (m *ClusterManager) waitForClusterActive(clusterName string, timeout time.Duration) (*Cluster, error) {
 	klog.Infof("waiting for cluster to be active: %s", clusterName)
 	out, err := eks.NewClusterActiveWaiter(m.clients.EKS()).WaitForOutput(context.TODO(), &eks.DescribeClusterInput{
 		Name: aws.String(clusterName),
-	}, clusterCreationTimeout)
+	}, timeout)
 	// log when possible, whether there was an error or not
 	if out != nil {
 		klog.Infof("cluster details: %+v", out.Cluster)
@@ -167,8 +162,7 @@ func (m *ClusterManager) deleteCluster() error {
 	err = eks.NewClusterDeletedWaiter(m.clients.EKS()).
 		Wait(context.TODO(), &eks.DescribeClusterInput{
 			Name: aws.String(m.resourceID),
-		},
-			clusterDeletionTimeout)
+		}, time.Minute*15) // TODO: make this configurable? it's more complicated than the creation timeout, since this func may be called by the janitor
 	if err != nil {
 		return fmt.Errorf("failed to wait for cluster to be deleted: %v", err)
 	}
diff --git a/internal/deployers/eksapi/deployer.go b/internal/deployers/eksapi/deployer.go
index f67a8d20f..4ebc5685d 100644
--- a/internal/deployers/eksapi/deployer.go
+++ b/internal/deployers/eksapi/deployer.go
@@ -55,16 +55,17 @@ type deployer struct {
 }
 
 type deployerOptions struct {
-	Addons                      []string `flag:"addons" desc:"Managed addons (name:version pairs) to create in the cluster. Use 'latest' for the most recent version, or 'default' for the default version."`
-	AMI                         string   `flag:"ami" desc:"AMI for unmanaged nodes"`
-	AMIType                     string   `flag:"ami-type" desc:"AMI type for managed nodes"`
-	AutoMode                    bool     `flag:"auto-mode" desc:"Enable EKS Auto Mode"`
-	CapacityReservation         bool     `flag:"capacity-reservation" desc:"Use capacity reservation for the unmanaged nodegroup"`
-	ClusterRoleServicePrincipal string   `flag:"cluster-role-service-principal" desc:"Additional service principal that can assume the cluster role"`
-	EFA                         bool     `flag:"efa" desc:"Create EFA interfaces on the node of an unmanaged nodegroup. Requires --unmanaged-nodes."`
-	EKSEndpointURL              string   `flag:"endpoint-url" desc:"Endpoint URL for the EKS API"`
-	EmitMetrics                 bool     `flag:"emit-metrics" desc:"Record and emit metrics to CloudWatch"`
-	ExpectedAMI                 string   `flag:"expected-ami" desc:"Expected AMI of nodes. Up will fail if the actual nodes are not utilizing the expected AMI. Defaults to --ami if defined."`
+	Addons                      []string      `flag:"addons" desc:"Managed addons (name:version pairs) to create in the cluster. Use 'latest' for the most recent version, or 'default' for the default version."`
+	AMI                         string        `flag:"ami" desc:"AMI for unmanaged nodes"`
+	AMIType                     string        `flag:"ami-type" desc:"AMI type for managed nodes"`
+	AutoMode                    bool          `flag:"auto-mode" desc:"Enable EKS Auto Mode"`
+	CapacityReservation         bool          `flag:"capacity-reservation" desc:"Use capacity reservation for the unmanaged nodegroup"`
+	ClusterCreationTimeout      time.Duration `flag:"cluster-creation-timeout" desc:"Time to wait for cluster to be created and become active."`
+	ClusterRoleServicePrincipal string        `flag:"cluster-role-service-principal" desc:"Additional service principal that can assume the cluster role"`
+	EFA                         bool          `flag:"efa" desc:"Create EFA interfaces on the node of an unmanaged nodegroup. Requires --unmanaged-nodes."`
+	EKSEndpointURL              string        `flag:"endpoint-url" desc:"Endpoint URL for the EKS API"`
+	EmitMetrics                 bool          `flag:"emit-metrics" desc:"Record and emit metrics to CloudWatch"`
+	ExpectedAMI                 string        `flag:"expected-ami" desc:"Expected AMI of nodes. Up will fail if the actual nodes are not utilizing the expected AMI. Defaults to --ami if defined."`
 	// TODO: remove this once it's no longer used in downstream jobs
 	GenerateSSHKey bool     `flag:"generate-ssh-key" desc:"Generate an SSH key to use for tests. The generated key should not be used in production, as it will not have a passphrase."`
 	InstanceTypes  []string `flag:"instance-types" desc:"Node instance types. Cannot be used with --instance-type-archs"`
@@ -250,6 +251,9 @@ func (d *deployer) verifyUpFlags() error {
 		d.IPFamily = string(ekstypes.IpFamilyIpv4)
 		klog.Infof("Using default IP family: %s", d.IPFamily)
 	}
+	if d.ClusterCreationTimeout == 0 {
+		d.ClusterCreationTimeout = time.Minute * 15
+	}
 	if d.NodeCreationTimeout == 0 {
 		d.NodeCreationTimeout = time.Minute * 20
 	}
@@ -324,7 +328,7 @@ func (d *deployer) Down() error {
 	return deleteResources(d.infraManager, d.clusterManager, d.nodeManager, d.k8sClient, &d.deployerOptions)
 }
 
-func deleteResources(im *InfrastructureManager, cm *ClusterManager, nm *nodeManager /* nillable */, k8sClient *k8sClient, opts *deployerOptions) error {
+func deleteResources(im *InfrastructureManager, cm *ClusterManager, nm *nodeManager, k8sClient *k8sClient /* nillable */, opts *deployerOptions /* nillable */) error {
 	if err := nm.deleteNodes(k8sClient, opts); err != nil {
 		return err
 	}
diff --git a/internal/deployers/eksapi/janitor.go b/internal/deployers/eksapi/janitor.go
index 4b23bc349..382c26613 100644
--- a/internal/deployers/eksapi/janitor.go
+++ b/internal/deployers/eksapi/janitor.go
@@ -114,7 +114,7 @@ func (j *janitor) sweepWorker(wg *sync.WaitGroup, stackQueue <-chan cloudformati
 	clusterManager := NewClusterManager(clients, resourceID)
 	nodeManager := NewNodeManager(clients, resourceID)
 	klog.Infof("deleting resources (%v old): %s", resourceAge, resourceID)
-	if err := deleteResources(infraManager, clusterManager, nodeManager /* TODO: pass a k8sClient */, nil, nil); err != nil {
+	if err := deleteResources(infraManager, clusterManager, nodeManager, nil /* k8sClient */, nil /* deployerOptions */); err != nil {
 		errChan <- fmt.Errorf("failed to delete resources: %s: %v", resourceID, err)
 	}
 }