perf: Capture InstanceTypeFilterErrors in error structs and don't print them until the end #1948

Merged
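In short, the scheduler stops formatting the failure reason eagerly with fmt.Errorf and instead returns a struct that satisfies the error interface, so the expensive string construction only happens if the error is actually printed. A minimal, generic sketch of that pattern (illustrative names only, not the code in this diff):

package example

import "fmt"

// deferredError captures the raw inputs of a failure and defers building
// the (expensive) message until Error() is actually called.
type deferredError struct {
	requirements string
	requests     string
}

func (e deferredError) Error() string {
	// Formatting work happens only when the error is printed.
	return fmt.Sprintf("no instance type satisfied requirements=%s, resources=%s", e.requirements, e.requests)
}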
130 changes: 64 additions & 66 deletions pkg/controllers/provisioning/scheduling/nodeclaim.go
@@ -103,17 +103,16 @@ func (n *NodeClaim) Add(pod *v1.Pod, podRequests v1.ResourceList) error {
// Check instance type combinations
requests := resources.Merge(n.Spec.Resources.Requests, podRequests)

filtered := filterInstanceTypesByRequirements(n.InstanceTypeOptions, nodeClaimRequirements, requests)

if len(filtered.remaining) == 0 {
// log the total resources being requested (daemonset + the pod)
cumulativeResources := resources.Merge(n.daemonResources, podRequests)
return fmt.Errorf("no instance type satisfied resources %s and requirements %s (%s)", resources.String(cumulativeResources), nodeClaimRequirements, filtered.FailureReason())
remaining, err := filterInstanceTypesByRequirements(n.InstanceTypeOptions, nodeClaimRequirements, podRequests, n.daemonResources, requests)
if err != nil {
// We avoid wrapping this err because calling String() on InstanceTypeFilterError is an expensive operation
// due to calls to resources.Merge and stringifying the nodeClaimRequirements
return err
}

// Update node
n.Pods = append(n.Pods, pod)
n.InstanceTypeOptions = filtered.remaining
n.InstanceTypeOptions = remaining
n.Spec.Resources.Requests = requests
n.Requirements = nodeClaimRequirements
n.topology.Record(pod, nodeClaimRequirements, scheduling.AllowUndefinedWellKnownLabels)
@@ -159,8 +158,7 @@ func InstanceTypeList(instanceTypeOptions []*cloudprovider.InstanceType) string
return itSb.String()
}

type filterResults struct {
remaining cloudprovider.InstanceTypes
type InstanceTypeFilterError struct {
// Each of these three flags indicates if that particular criteria was met by at least one instance type
requirementsMet bool
fits bool
@@ -173,95 +171,92 @@ type filterResults struct {
// fitsAndOffering indicates if a single instance type had enough resources and was a required offering
fitsAndOffering bool
minValuesIncompatibleErr error
requests v1.ResourceList

// We capture requirements so that we can know what the requirements were when evaluating instance type compatibility
requirements scheduling.Requirements
// We capture podRequests here since when a pod can't schedule due to requests, it's because the pod
// was on its own on the simulated Node and exceeded the available resources for any instance type for this NodePool
podRequests v1.ResourceList
// We capture daemonRequests since this contributes to the resources that are required to schedule to this NodePool
daemonRequests v1.ResourceList
}

// FailureReason returns a presentable string explaining why all instance types were filtered out
//
//nolint:gocyclo
func (r filterResults) FailureReason() string {
if len(r.remaining) > 0 {
return ""
}

func (e InstanceTypeFilterError) Error() string {
// minValues is specified in the requirements and is not met
if r.minValuesIncompatibleErr != nil {
return r.minValuesIncompatibleErr.Error()
if e.minValuesIncompatibleErr != nil {
return fmt.Sprintf("%s, requirements=%s, resources=%s", e.minValuesIncompatibleErr.Error(), e.requirements, resources.String(resources.Merge(e.daemonRequests, e.podRequests)))
}

// no instance type met any of the three criteria, meaning each criteria was enough to completely prevent
// this pod from scheduling
if !r.requirementsMet && !r.fits && !r.hasOffering {
return "no instance type met the scheduling requirements or had enough resources or had a required offering"
if !e.requirementsMet && !e.fits && !e.hasOffering {
return fmt.Sprintf("no instance type met the scheduling requirements or had enough resources or had a required offering, requirements=%s, resources=%s", e.requirements, resources.String(resources.Merge(e.daemonRequests, e.podRequests)))
Contributor

Does the improvement require that e.requirements and resources.String(resources.Merge(e.daemonRequests, e.podRequests)) be called in every branch, as opposed to once at the top of Error()?

Member Author

No, it doesn't -- I was going back and forth on whether to add the code duplication. Doing it this way seemed fine for the short-circuiting and early error returns; otherwise I would have to build a string and then append to it.
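For reference, a minimal sketch of the compute-once alternative raised above (hypothetical, not what was merged), assuming the same fields and helpers shown in this diff:

// Hypothetical variant: build the requirements/resources suffix once,
// then append it to whichever branch message applies.
func (e InstanceTypeFilterError) Error() string {
	suffix := fmt.Sprintf(", requirements=%s, resources=%s",
		e.requirements, resources.String(resources.Merge(e.daemonRequests, e.podRequests)))
	if e.minValuesIncompatibleErr != nil {
		return e.minValuesIncompatibleErr.Error() + suffix
	}
	if !e.requirementsMet && !e.fits && !e.hasOffering {
		return "no instance type met the scheduling requirements or had enough resources or had a required offering" + suffix
	}
	// ... remaining branches follow the same pattern ...
	return "no instance type met the requirements/resources/offering tuple" + suffix
}

The merged change keeps the per-branch formatting so each branch can return immediately without building an intermediate string.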

}

// check the other pairwise criteria
if !r.requirementsMet && !r.fits {
return "no instance type met the scheduling requirements or had enough resources"
if !e.requirementsMet && !e.fits {
return fmt.Sprintf("no instance type met the scheduling requirements or had enough resources, requirements=%s, resources=%s", e.requirements, resources.String(resources.Merge(e.daemonRequests, e.podRequests)))
}

if !r.requirementsMet && !r.hasOffering {
return "no instance type met the scheduling requirements or had a required offering"
if !e.requirementsMet && !e.hasOffering {
return fmt.Sprintf("no instance type met the scheduling requirements or had a required offering, requirements=%s, resources=%s", e.requirements, resources.String(resources.Merge(e.daemonRequests, e.podRequests)))
}

if !r.fits && !r.hasOffering {
return "no instance type had enough resources or had a required offering"
if !e.fits && !e.hasOffering {
return fmt.Sprintf("no instance type had enough resources or had a required offering, requirements=%s, resources=%s", e.requirements, resources.String(resources.Merge(e.daemonRequests, e.podRequests)))
}

// and then each individual criteria. These are sort of the same as above in that each one indicates that no
// instance type matched that criteria at all, so it was enough to exclude all instance types. I think it's
// helpful to have these separate, since we can report the multiple excluding criteria above.
if !r.requirementsMet {
return "no instance type met all requirements"
if !e.requirementsMet {
return fmt.Sprintf("no instance type met all requirements, requirements=%s, resources=%s", e.requirements, resources.String(resources.Merge(e.daemonRequests, e.podRequests)))
}

if !r.fits {
msg := "no instance type has enough resources"
if !e.fits {
msg := fmt.Sprintf("no instance type has enough resources, requirements=%s, resources=%s", e.requirements, resources.String(resources.Merge(e.daemonRequests, e.podRequests)))
// special case for a user typo I saw reported once
if r.requests.Cpu().Cmp(resource.MustParse("1M")) >= 0 {
if e.podRequests.Cpu().Cmp(resource.MustParse("1M")) >= 0 {
msg += " (CPU request >= 1 Million, m vs M typo?)"
}
return msg
}

if !r.hasOffering {
return "no instance type has the required offering"
if !e.hasOffering {
return fmt.Sprintf("no instance type has the required offering, requirements=%s, resources=%s", e.requirements, resources.String(resources.Merge(e.daemonRequests, e.podRequests)))
}

// see if any pair of criteria was enough to exclude all instances
if r.requirementsAndFits {
return "no instance type which met the scheduling requirements and had enough resources, had a required offering"
if e.requirementsAndFits {
return fmt.Sprintf("no instance type which met the scheduling requirements and had enough resources, had a required offering, requirements=%s, resources=%s", e.requirements, resources.String(resources.Merge(e.daemonRequests, e.podRequests)))
}
if r.fitsAndOffering {
return "no instance type which had enough resources and the required offering met the scheduling requirements"
if e.fitsAndOffering {
return fmt.Sprintf("no instance type which had enough resources and the required offering met the scheduling requirements, requirements=%s, resources=%s", e.requirements, resources.String(resources.Merge(e.daemonRequests, e.podRequests)))
}
if r.requirementsAndOffering {
return "no instance type which met the scheduling requirements and the required offering had the required resources"
if e.requirementsAndOffering {
return fmt.Sprintf("no instance type which met the scheduling requirements and the required offering had the required resources, requirements=%s, resources=%s", e.requirements, resources.String(resources.Merge(e.daemonRequests, e.podRequests)))
}

// finally all instances were filtered out, but we had at least one instance that met each criteria, and met each
// pairwise set of criteria, so the only thing that remains is no instance which met all three criteria simultaneously
return "no instance type met the requirements/resources/offering tuple"
return fmt.Sprintf("no instance type met the requirements/resources/offering tuple, requirements=%s, resources=%s", e.requirements, resources.String(resources.Merge(e.daemonRequests, e.podRequests)))
}

//nolint:gocyclo
func filterInstanceTypesByRequirements(instanceTypes []*cloudprovider.InstanceType, requirements scheduling.Requirements, requests v1.ResourceList) filterResults {
results := filterResults{
requests: requests,
func filterInstanceTypesByRequirements(instanceTypes []*cloudprovider.InstanceType, requirements scheduling.Requirements, podRequests, daemonRequests, totalRequests v1.ResourceList) (cloudprovider.InstanceTypes, error) {
// We hold the results of our scheduling simulation inside of this InstanceTypeFilterError struct
// to reduce the CPU load of having to generate the error string for a failed scheduling simulation
err := InstanceTypeFilterError{
requirementsMet: false,
fits: false,
hasOffering: false,

requirementsAndFits: false,
requirementsAndOffering: false,
fitsAndOffering: false,

podRequests: podRequests,
daemonRequests: daemonRequests,
}
remaining := cloudprovider.InstanceTypes{}

for _, it := range instanceTypes {
// the tradeoff to not short circuiting on the filtering is that we can report much better error messages
// the tradeoff to not short-circuiting on the filtering is that we can report much better error messages
// about why scheduling failed
itCompat := compatible(it, requirements)
itFits := fits(it, requests)
itFits := fits(it, totalRequests)

// By using this iterative approach vs. the Available() function it prevents allocations
// which have to be garbage collected and slow down Karpenter's scheduling algorithm
@@ -274,31 +269,34 @@ func filterInstanceTypesByRequirements(instanceTypes []*cloudprovider.InstanceTy
}

// track if any single instance type met a single criteria
results.requirementsMet = results.requirementsMet || itCompat
results.fits = results.fits || itFits
results.hasOffering = results.hasOffering || itHasOffering
err.requirementsMet = err.requirementsMet || itCompat
err.fits = err.fits || itFits
err.hasOffering = err.hasOffering || itHasOffering

// track if any single instance type met the three pairs of criteria
results.requirementsAndFits = results.requirementsAndFits || (itCompat && itFits && !itHasOffering)
results.requirementsAndOffering = results.requirementsAndOffering || (itCompat && itHasOffering && !itFits)
results.fitsAndOffering = results.fitsAndOffering || (itFits && itHasOffering && !itCompat)
err.requirementsAndFits = err.requirementsAndFits || (itCompat && itFits && !itHasOffering)
err.requirementsAndOffering = err.requirementsAndOffering || (itCompat && itHasOffering && !itFits)
err.fitsAndOffering = err.fitsAndOffering || (itFits && itHasOffering && !itCompat)

// and if it met all criteria, we keep the instance type and continue filtering. We now won't be reporting
// any errors.
if itCompat && itFits && itHasOffering {
results.remaining = append(results.remaining, it)
remaining = append(remaining, it)
}
}

if requirements.HasMinValues() {
// We don't care about the minimum number of instance types that meet our requirements here, we only care if they meet our requirements.
_, results.minValuesIncompatibleErr = results.remaining.SatisfiesMinValues(requirements)
if results.minValuesIncompatibleErr != nil {
_, err.minValuesIncompatibleErr = remaining.SatisfiesMinValues(requirements)
if err.minValuesIncompatibleErr != nil {
// If minValues is NOT met for any of the requirement across InstanceTypes, then return empty InstanceTypeOptions as we cannot launch with the remaining InstanceTypes.
results.remaining = nil
remaining = nil
}
}
return results
if len(remaining) == 0 {
return nil, err
}
return remaining, nil
}

func compatible(instanceType *cloudprovider.InstanceType, requirements scheduling.Requirements) bool {
2 changes: 1 addition & 1 deletion pkg/controllers/provisioning/scheduling/scheduler.go
@@ -62,7 +62,7 @@ func NewScheduler(ctx context.Context, kubeClient client.Client, nodePools []*v1
// Pre-filter instance types eligible for NodePools to reduce work done during scheduling loops for pods
templates := lo.FilterMap(nodePools, func(np *v1.NodePool, _ int) (*NodeClaimTemplate, bool) {
nct := NewNodeClaimTemplate(np)
nct.InstanceTypeOptions = filterInstanceTypesByRequirements(instanceTypes[np.Name], nct.Requirements, corev1.ResourceList{}).remaining
nct.InstanceTypeOptions, _ = filterInstanceTypesByRequirements(instanceTypes[np.Name], nct.Requirements, corev1.ResourceList{}, corev1.ResourceList{}, corev1.ResourceList{})
if len(nct.InstanceTypeOptions) == 0 {
recorder.Publish(NoCompatibleInstanceTypes(np))
log.FromContext(ctx).WithValues("NodePool", klog.KRef("", np.Name)).Info("skipping, nodepool requirements filtered out all instance types")