Skip to content

Commit

Permalink
Capture InstanceTypeFilterErrors in error structs and don't print them until the end
Browse files Browse the repository at this point in the history
  • Loading branch information
jonathan-innis committed Jan 31, 2025
1 parent 1e21a7d commit ecce7df
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 67 deletions.
128 changes: 62 additions & 66 deletions pkg/controllers/provisioning/scheduling/nodeclaim.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,17 +103,14 @@ func (n *NodeClaim) Add(pod *v1.Pod, podRequests v1.ResourceList) error {
// Check instance type combinations
requests := resources.Merge(n.Spec.Resources.Requests, podRequests)

filtered := filterInstanceTypesByRequirements(n.InstanceTypeOptions, nodeClaimRequirements, requests)

if len(filtered.remaining) == 0 {
// log the total resources being requested (daemonset + the pod)
cumulativeResources := resources.Merge(n.daemonResources, podRequests)
return fmt.Errorf("no instance type satisfied resources %s and requirements %s (%s)", resources.String(cumulativeResources), nodeClaimRequirements, filtered.FailureReason())
remaining, err := filterInstanceTypesByRequirements(n.InstanceTypeOptions, nodeClaimRequirements, podRequests, n.daemonResources, requests)
if err != nil {
return fmt.Errorf("incompatible instance types, %w", err)
}

// Update node
n.Pods = append(n.Pods, pod)
n.InstanceTypeOptions = filtered.remaining
n.InstanceTypeOptions = remaining
n.Spec.Resources.Requests = requests
n.Requirements = nodeClaimRequirements
n.topology.Record(pod, nodeClaimRequirements, scheduling.AllowUndefinedWellKnownLabels)
Expand Down Expand Up @@ -159,8 +156,7 @@ func InstanceTypeList(instanceTypeOptions []*cloudprovider.InstanceType) string
return itSb.String()
}

type filterResults struct {
remaining cloudprovider.InstanceTypes
type InstanceTypeFilterError struct {
// Each of these three flags indicates if that particular criteria was met by at least one instance type
requirementsMet bool
fits bool
Expand All @@ -173,123 +169,123 @@ type filterResults struct {
// fitsAndOffering indicates if a single instance type had enough resources and was a required offering
fitsAndOffering bool
minValuesIncompatibleErr error
requests v1.ResourceList

// We capture requirements so that we can know what the requirements were when evaluating instance type compatibility
requirements scheduling.Requirements
// We capture podRequests here since when a pod can't schedule due to requests, it's because the pod
// was on its own on the simulated Node and exceeded the available resources for any instance type for this NodePool
podRequests v1.ResourceList
// We capture daemonRequests since this contributes to the resources that are required to schedule to this NodePool
daemonRequests v1.ResourceList
}

// FailureReason returns a presentable string explaining why all instance types were filtered out
//
//nolint:gocyclo
func (r filterResults) FailureReason() string {
if len(r.remaining) > 0 {
return ""
}

func (e InstanceTypeFilterError) Error() string {
// minValues is specified in the requirements and is not met
if r.minValuesIncompatibleErr != nil {
return r.minValuesIncompatibleErr.Error()
if e.minValuesIncompatibleErr != nil {
return fmt.Sprintf("%s, requirements=%s, resources=%s", e.minValuesIncompatibleErr.Error(), e.requirements, resources.String(resources.Merge(e.daemonRequests, e.podRequests)))
}

// no instance type met any of the three criteria, meaning each criteria was enough to completely prevent
// this pod from scheduling
if !r.requirementsMet && !r.fits && !r.hasOffering {
return "no instance type met the scheduling requirements or had enough resources or had a required offering"
if !e.requirementsMet && !e.fits && !e.hasOffering {
return fmt.Sprintf("no instance type met the scheduling requirements or had enough resources or had a required offering, requirements=%s, resources=%s", e.requirements, resources.String(resources.Merge(e.daemonRequests, e.podRequests)))
}

// check the other pairwise criteria
if !r.requirementsMet && !r.fits {
return "no instance type met the scheduling requirements or had enough resources"
if !e.requirementsMet && !e.fits {
return fmt.Sprintf("no instance type met the scheduling requirements or had enough resources, requirements=%s, resources=%s", e.requirements, resources.String(resources.Merge(e.daemonRequests, e.podRequests)))
}

if !r.requirementsMet && !r.hasOffering {
return "no instance type met the scheduling requirements or had a required offering"
if !e.requirementsMet && !e.hasOffering {
return fmt.Sprintf("no instance type met the scheduling requirements or had a required offering, requirements=%s, resources=%s", e.requirements, resources.String(resources.Merge(e.daemonRequests, e.podRequests)))
}

if !r.fits && !r.hasOffering {
return "no instance type had enough resources or had a required offering"
if !e.fits && !e.hasOffering {
return fmt.Sprintf("no instance type had enough resources or had a required offering, requirements=%s, resources=%s", e.requirements, resources.String(resources.Merge(e.daemonRequests, e.podRequests)))
}

// and then each individual criteria. These are sort of the same as above in that each one indicates that no
// instance type matched that criteria at all, so it was enough to exclude all instance types. I think it's
// helpful to have these separate, since we can report the multiple excluding criteria above.
if !r.requirementsMet {
return "no instance type met all requirements"
if !e.requirementsMet {
return fmt.Sprintf("no instance type met all requirements, requirements=%s, resources=%s", e.requirements, resources.String(resources.Merge(e.daemonRequests, e.podRequests)))
}

if !r.fits {
msg := "no instance type has enough resources"
if !e.fits {
msg := fmt.Sprintf("no instance type has enough resources, requirements=%s, resources=%s", e.requirements, resources.String(resources.Merge(e.daemonRequests, e.podRequests)))
// special case for a user typo I saw reported once
if r.requests.Cpu().Cmp(resource.MustParse("1M")) >= 0 {
if e.podRequests.Cpu().Cmp(resource.MustParse("1M")) >= 0 {
msg += " (CPU request >= 1 Million, m vs M typo?)"
}
return msg
}

if !r.hasOffering {
return "no instance type has the required offering"
if !e.hasOffering {
return fmt.Sprintf("no instance type has the required offering, requirements=%s, resources=%s", e.requirements, resources.String(resources.Merge(e.daemonRequests, e.podRequests)))
}

// see if any pair of criteria was enough to exclude all instances
if r.requirementsAndFits {
return "no instance type which met the scheduling requirements and had enough resources, had a required offering"
if e.requirementsAndFits {
return fmt.Sprintf("no instance type which met the scheduling requirements and had enough resources, had a required offering, requirements=%s, resources=%s", e.requirements, resources.String(resources.Merge(e.daemonRequests, e.podRequests)))
}
if r.fitsAndOffering {
return "no instance type which had enough resources and the required offering met the scheduling requirements"
if e.fitsAndOffering {
return fmt.Sprintf("no instance type which had enough resources and the required offering met the scheduling requirements, requirements=%s, resources=%s", e.requirements, resources.String(resources.Merge(e.daemonRequests, e.podRequests)))
}
if r.requirementsAndOffering {
return "no instance type which met the scheduling requirements and the required offering had the required resources"
if e.requirementsAndOffering {
return fmt.Sprintf("no instance type which met the scheduling requirements and the required offering had the required resources, requirements=%s, resources=%s", e.requirements, resources.String(resources.Merge(e.daemonRequests, e.podRequests)))
}

// finally all instances were filtered out, but we had at least one instance that met each criteria, and met each
// pairwise set of criteria, so the only thing that remains is no instance which met all three criteria simultaneously
return "no instance type met the requirements/resources/offering tuple"
return fmt.Sprintf("no instance type met the requirements/resources/offering tuple, requirements=%s, resources=%s", e.requirements, resources.String(resources.Merge(e.daemonRequests, e.podRequests)))
}

//nolint:gocyclo
func filterInstanceTypesByRequirements(instanceTypes []*cloudprovider.InstanceType, requirements scheduling.Requirements, requests v1.ResourceList) filterResults {
results := filterResults{
requests: requests,
func filterInstanceTypesByRequirements(instanceTypes []*cloudprovider.InstanceType, requirements scheduling.Requirements, podRequests, daemonRequests, totalRequests v1.ResourceList) (cloudprovider.InstanceTypes, error) {
// We hold the results of our scheduling simulation inside of this InstanceTypeFilterError struct
// to reduce the CPU load of having to generate the error string for a failed scheduling simulation
err := InstanceTypeFilterError{
requirementsMet: false,
fits: false,
hasOffering: false,

requirementsAndFits: false,
requirementsAndOffering: false,
fitsAndOffering: false,

podRequests: podRequests,
daemonRequests: daemonRequests,
}
remaining := cloudprovider.InstanceTypes{}

for _, it := range instanceTypes {
// the tradeoff to not short circuiting on the filtering is that we can report much better error messages
// the tradeoff to not short-circuiting on the filtering is that we can report much better error messages
// about why scheduling failed
itCompat := compatible(it, requirements)
itFits := fits(it, requests)
itFits := fits(it, totalRequests)
itHasOffering := it.Offerings.Available().HasCompatible(requirements)

// track if any single instance type met a single criteria
results.requirementsMet = results.requirementsMet || itCompat
results.fits = results.fits || itFits
results.hasOffering = results.hasOffering || itHasOffering
err.requirementsMet = err.requirementsMet || itCompat
err.fits = err.fits || itFits
err.hasOffering = err.hasOffering || itHasOffering

// track if any single instance type met the three pairs of criteria
results.requirementsAndFits = results.requirementsAndFits || (itCompat && itFits && !itHasOffering)
results.requirementsAndOffering = results.requirementsAndOffering || (itCompat && itHasOffering && !itFits)
results.fitsAndOffering = results.fitsAndOffering || (itFits && itHasOffering && !itCompat)
err.requirementsAndFits = err.requirementsAndFits || (itCompat && itFits && !itHasOffering)
err.requirementsAndOffering = err.requirementsAndOffering || (itCompat && itHasOffering && !itFits)
err.fitsAndOffering = err.fitsAndOffering || (itFits && itHasOffering && !itCompat)

// and if it met all criteria, we keep the instance type and continue filtering. We now won't be reporting
// any errors.
if itCompat && itFits && itHasOffering {
results.remaining = append(results.remaining, it)
remaining = append(remaining, it)
}
}

if requirements.HasMinValues() {
// We don't care about the minimum number of instance types that meet our requirements here, we only care if they meet our requirements.
_, results.minValuesIncompatibleErr = results.remaining.SatisfiesMinValues(requirements)
if results.minValuesIncompatibleErr != nil {
_, err.minValuesIncompatibleErr = remaining.SatisfiesMinValues(requirements)
if err.minValuesIncompatibleErr != nil {
// If minValues is NOT met for any of the requirement across InstanceTypes, then return empty InstanceTypeOptions as we cannot launch with the remaining InstanceTypes.
results.remaining = nil
remaining = nil
}
}
return results
if len(remaining) == 0 {
return nil, err
}
return remaining, nil
}

func compatible(instanceType *cloudprovider.InstanceType, requirements scheduling.Requirements) bool {
Expand Down
2 changes: 1 addition & 1 deletion pkg/controllers/provisioning/scheduling/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ func NewScheduler(ctx context.Context, kubeClient client.Client, nodePools []*v1
// Pre-filter instance types eligible for NodePools to reduce work done during scheduling loops for pods
templates := lo.FilterMap(nodePools, func(np *v1.NodePool, _ int) (*NodeClaimTemplate, bool) {
nct := NewNodeClaimTemplate(np)
nct.InstanceTypeOptions = filterInstanceTypesByRequirements(instanceTypes[np.Name], nct.Requirements, corev1.ResourceList{}).remaining
nct.InstanceTypeOptions, _ = filterInstanceTypesByRequirements(instanceTypes[np.Name], nct.Requirements, corev1.ResourceList{}, corev1.ResourceList{}, corev1.ResourceList{})
if len(nct.InstanceTypeOptions) == 0 {
recorder.Publish(NoCompatibleInstanceTypes(np))
log.FromContext(ctx).WithValues("NodePool", klog.KRef("", np.Name)).Info("skipping, nodepool requirements filtered out all instance types")
Expand Down

0 comments on commit ecce7df

Please sign in to comment.