feat: add --cluster-creation-timeout flag #582

Merged · 1 commit · Feb 14, 2025
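This change makes the wait for EKS cluster creation configurable via a new --cluster-creation-timeout flag, replacing a hard-coded 15-minute constant (which remains the default when the flag is unset). The cluster deletion timeout stays hard-coded for now, since the deletion path is also used by the janitor, which runs without deployer options.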
14 changes: 4 additions & 10 deletions internal/deployers/eksapi/cluster.go
@@ -13,11 +13,6 @@ import (
 	"k8s.io/klog/v2"
 )
 
-const (
-	clusterCreationTimeout = time.Minute * 15
-	clusterDeletionTimeout = time.Minute * 15
-)
-
 type ClusterManager struct {
 	clients    *awsClients
 	resourceID string
@@ -93,18 +88,18 @@ func (m *ClusterManager) getOrCreateCluster(infra *Infrastructure, opts *deployerOptions)
 	} else {
 		klog.Infof("reusing existing static cluster %s", opts.StaticClusterName)
 	}
-	cluster, waitErr := m.waitForClusterActive(targetClusterName)
+	cluster, waitErr := m.waitForClusterActive(targetClusterName, opts.ClusterCreationTimeout)
 	if waitErr != nil {
 		return nil, fmt.Errorf("failed to wait for cluster to become active: %v", waitErr)
 	}
 	return cluster, nil
 }
 
-func (m *ClusterManager) waitForClusterActive(clusterName string) (*Cluster, error) {
+func (m *ClusterManager) waitForClusterActive(clusterName string, timeout time.Duration) (*Cluster, error) {
 	klog.Infof("waiting for cluster to be active: %s", clusterName)
 	out, err := eks.NewClusterActiveWaiter(m.clients.EKS()).WaitForOutput(context.TODO(), &eks.DescribeClusterInput{
 		Name: aws.String(clusterName),
-	}, clusterCreationTimeout)
+	}, timeout)
 	// log when possible, whether there was an error or not
 	if out != nil {
 		klog.Infof("cluster details: %+v", out.Cluster)
@@ -167,8 +162,7 @@ func (m *ClusterManager) deleteCluster() error {
 	err = eks.NewClusterDeletedWaiter(m.clients.EKS()).
 		Wait(context.TODO(), &eks.DescribeClusterInput{
 			Name: aws.String(m.resourceID),
-		},
-			clusterDeletionTimeout)
+		}, time.Minute*15) // TODO: make this configurable? it's more complicated than the creation timeout, since this func may be called by the janitor
 	if err != nil {
 		return fmt.Errorf("failed to wait for cluster to be deleted: %v", err)
 	}
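For context, this is roughly how the aws-sdk-go-v2 cluster waiter is driven with a caller-supplied timeout, as in waitForClusterActive above. A minimal, self-contained sketch; the client setup and the hard-coded name and timeout are illustrative, not part of this PR:

```go
package main

import (
	"context"
	"fmt"
	"time"

	"github.com/aws/aws-sdk-go-v2/aws"
	"github.com/aws/aws-sdk-go-v2/config"
	"github.com/aws/aws-sdk-go-v2/service/eks"
)

func waitForClusterActive(ctx context.Context, client *eks.Client, name string, timeout time.Duration) error {
	// WaitForOutput polls DescribeCluster until the cluster reaches
	// ACTIVE, hits a terminal failure state, or the timeout elapses.
	out, err := eks.NewClusterActiveWaiter(client).WaitForOutput(ctx, &eks.DescribeClusterInput{
		Name: aws.String(name),
	}, timeout)
	// Log the last-seen cluster state whether or not the wait succeeded.
	if out != nil {
		fmt.Printf("cluster details: %+v\n", out.Cluster)
	}
	return err
}

func main() {
	cfg, err := config.LoadDefaultConfig(context.TODO())
	if err != nil {
		panic(err)
	}
	if err := waitForClusterActive(context.TODO(), eks.NewFromConfig(cfg), "my-cluster", 30*time.Minute); err != nil {
		panic(err)
	}
}
```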
26 changes: 15 additions & 11 deletions internal/deployers/eksapi/deployer.go
@@ -55,16 +55,17 @@ type deployer struct {
 }
 
 type deployerOptions struct {
-	Addons                      []string `flag:"addons" desc:"Managed addons (name:version pairs) to create in the cluster. Use 'latest' for the most recent version, or 'default' for the default version."`
-	AMI                         string   `flag:"ami" desc:"AMI for unmanaged nodes"`
-	AMIType                     string   `flag:"ami-type" desc:"AMI type for managed nodes"`
-	AutoMode                    bool     `flag:"auto-mode" desc:"Enable EKS Auto Mode"`
-	CapacityReservation         bool     `flag:"capacity-reservation" desc:"Use capacity reservation for the unmanaged nodegroup"`
-	ClusterRoleServicePrincipal string   `flag:"cluster-role-service-principal" desc:"Additional service principal that can assume the cluster role"`
-	EFA                         bool     `flag:"efa" desc:"Create EFA interfaces on the node of an unmanaged nodegroup. Requires --unmanaged-nodes."`
-	EKSEndpointURL              string   `flag:"endpoint-url" desc:"Endpoint URL for the EKS API"`
-	EmitMetrics                 bool     `flag:"emit-metrics" desc:"Record and emit metrics to CloudWatch"`
-	ExpectedAMI                 string   `flag:"expected-ami" desc:"Expected AMI of nodes. Up will fail if the actual nodes are not utilizing the expected AMI. Defaults to --ami if defined."`
+	Addons                      []string      `flag:"addons" desc:"Managed addons (name:version pairs) to create in the cluster. Use 'latest' for the most recent version, or 'default' for the default version."`
+	AMI                         string        `flag:"ami" desc:"AMI for unmanaged nodes"`
+	AMIType                     string        `flag:"ami-type" desc:"AMI type for managed nodes"`
+	AutoMode                    bool          `flag:"auto-mode" desc:"Enable EKS Auto Mode"`
+	CapacityReservation         bool          `flag:"capacity-reservation" desc:"Use capacity reservation for the unmanaged nodegroup"`
+	ClusterCreationTimeout      time.Duration `flag:"cluster-creation-timeout" desc:"Time to wait for cluster to be created and become active."`
+	ClusterRoleServicePrincipal string        `flag:"cluster-role-service-principal" desc:"Additional service principal that can assume the cluster role"`
+	EFA                         bool          `flag:"efa" desc:"Create EFA interfaces on the node of an unmanaged nodegroup. Requires --unmanaged-nodes."`
+	EKSEndpointURL              string        `flag:"endpoint-url" desc:"Endpoint URL for the EKS API"`
+	EmitMetrics                 bool          `flag:"emit-metrics" desc:"Record and emit metrics to CloudWatch"`
+	ExpectedAMI                 string        `flag:"expected-ami" desc:"Expected AMI of nodes. Up will fail if the actual nodes are not utilizing the expected AMI. Defaults to --ami if defined."`
 	// TODO: remove this once it's no longer used in downstream jobs
 	GenerateSSHKey bool     `flag:"generate-ssh-key" desc:"Generate an SSH key to use for tests. The generated key should not be used in production, as it will not have a passphrase."`
 	InstanceTypes  []string `flag:"instance-types" desc:"Node instance types. Cannot be used with --instance-type-archs"`
@@ -250,6 +251,9 @@ func (d *deployer) verifyUpFlags() error {
 		d.IPFamily = string(ekstypes.IpFamilyIpv4)
 		klog.Infof("Using default IP family: %s", d.IPFamily)
 	}
+	if d.ClusterCreationTimeout == 0 {
+		d.ClusterCreationTimeout = time.Minute * 15
+	}
 	if d.NodeCreationTimeout == 0 {
 		d.NodeCreationTimeout = time.Minute * 20
 	}
@@ -324,7 +328,7 @@ func (d *deployer) Down() error {
 	return deleteResources(d.infraManager, d.clusterManager, d.nodeManager, d.k8sClient, &d.deployerOptions)
 }
 
-func deleteResources(im *InfrastructureManager, cm *ClusterManager, nm *nodeManager /* nillable */, k8sClient *k8sClient, opts *deployerOptions) error {
+func deleteResources(im *InfrastructureManager, cm *ClusterManager, nm *nodeManager, k8sClient *k8sClient /* nillable */, opts *deployerOptions /* nillable */) error {
 	if err := nm.deleteNodes(k8sClient, opts); err != nil {
 		return err
 	}
2 changes: 1 addition & 1 deletion internal/deployers/eksapi/janitor.go
@@ -114,7 +114,7 @@ func (j *janitor) sweepWorker(wg *sync.WaitGroup, stackQueue <-chan cloudformati
 		clusterManager := NewClusterManager(clients, resourceID)
 		nodeManager := NewNodeManager(clients, resourceID)
 		klog.Infof("deleting resources (%v old): %s", resourceAge, resourceID)
-		if err := deleteResources(infraManager, clusterManager, nodeManager /* TODO: pass a k8sClient */, nil, nil); err != nil {
+		if err := deleteResources(infraManager, clusterManager, nodeManager, nil /* k8sClient */, nil /* deployerOptions */); err != nil {
 			errChan <- fmt.Errorf("failed to delete resources: %s: %v", resourceID, err)
 		}
 	}
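Note the comment shuffle in the deleteResources signature above: since the janitor passes nil for both the Kubernetes client and the deployer options, everything downstream has to be nil-tolerant. A sketch of that defensive pattern; the types, the skip message, and the deleteNodes body are illustrative, not the repository's actual implementation:

```go
package main

import "fmt"

type k8sClient struct{}

type deployerOptions struct{}

// deleteNodes tolerates nil inputs so the same cleanup path works for
// the deployer (which has a live client) and the janitor (which doesn't).
func deleteNodes(client *k8sClient, opts *deployerOptions) error {
	if client == nil || opts == nil {
		fmt.Println("no k8s client/options available; skipping graceful node cleanup")
		return nil
	}
	// ... drain and delete nodes using the client ...
	return nil
}

func main() {
	// Janitor-style invocation.
	if err := deleteNodes(nil, nil); err != nil {
		panic(err)
	}
}
```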