
Commit 8e112db

Remove redundant SheddableCapacityFilter. (#910)
Admission control and capacity management are now handled in `requestcontrol.Director.PreDispatch` (and will soon be absorbed into the new Flow Controller). This should no longer be a responsibility of the scheduling framework, and as of #805 the check is already applied before the scheduling layer is invoked.

This is not a no-op change. Previously, the `SheddableCapacityFilter` not only dropped sheddable requests when the system was at capacity; it also strictly filtered the pods that the remaining scheduling plugins would consider as input. This change removes that strict filtering, so all pods are now considered as long as the system is not saturated. Sheddable requests therefore follow the same scheduling path as critical requests, provided they are not dropped by the saturation detection check in `PreDispatch`.
1 parent ee64ec6 commit 8e112db
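
For context, the deleted filter's behavior can be reconstructed from the test cases removed below. A minimal sketch, assuming the field names from the test's composite literal and the `Filter` signature used elsewhere in filter_test.go; the deleted file's body is not shown in this diff, and `GetMetrics` is an assumption:

// Sketch only: reconstructed from the removed filter_test.go case, not from
// the deleted source file.
type SheddableCapacityFilter struct {
	queueThreshold   int
	kvCacheThreshold float64
}

// Filter passed every pod through for critical requests. For sheddable
// (non-critical) requests it kept only pods at or below both thresholds,
// so an empty result meant the request was dropped as over capacity, and
// the surviving pods were the only input the remaining plugins ever saw.
func (f *SheddableCapacityFilter) Filter(ctx context.Context, req *types.LLMRequest, cycleState *types.CycleState, pods []types.Pod) []types.Pod {
	if req.Critical {
		return pods
	}
	filtered := make([]types.Pod, 0, len(pods))
	for _, pod := range pods {
		m := pod.GetMetrics()
		if m.WaitingQueueSize <= f.queueThreshold && m.KVCacheUsagePercent <= f.kvCacheThreshold {
			filtered = append(filtered, pod)
		}
	}
	return filtered
}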

File tree

6 files changed: +7 additions, -227 deletions


cmd/epp/main.go
Lines changed: 0 additions & 2 deletions

@@ -44,7 +44,6 @@ import (
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/saturationdetector"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/filter"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/prefix"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/picker"
 	profilepicker "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/profile-picker"
@@ -209,7 +208,6 @@ func run() error {
 	kvCacheScorerWeight := envutil.GetEnvInt("KV_CACHE_SCORE_WEIGHT", scorer.DefaultKVCacheScorerWeight, setupLog)

 	schedulerProfile := framework.NewSchedulerProfile().
-		WithFilters(filter.NewSheddableCapacityFilter()).
 		WithScorers(framework.NewWeightedScorer(&scorer.QueueScorer{}, queueScorerWeight),
 			framework.NewWeightedScorer(&scorer.KVCacheScorer{}, kvCacheScorerWeight)).
 		WithPicker(picker.NewMaxScorePicker())
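
With the filter gone from the default profile, shedding happens only before scheduling. A rough sketch of that admission check as the commit message describes it, in `requestcontrol.Director.PreDispatch`; the Director itself is outside this diff, so the method body and the `saturationDetector`/`IsSaturated` names are assumptions, loosely based on the `saturationdetector` import kept above:

// Sketch only; not the actual Director implementation.
func (d *Director) PreDispatch(ctx context.Context, req *types.LLMRequest) error {
	// Critical requests are never shed.
	if req.Critical {
		return nil
	}
	// Sheddable requests are dropped only when the pool as a whole looks
	// saturated; otherwise they proceed into scheduling over the full pod
	// set, the same path critical requests take.
	if d.saturationDetector.IsSaturated(ctx) {
		return fmt.Errorf("dropping sheddable request %q: pool saturated", req.RequestId)
	}
	return nil
}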

pkg/epp/scheduling/framework/plugins/filter/filter_test.go
Lines changed: 1 addition & 37 deletions

@@ -134,42 +134,6 @@ func TestFilter(t *testing.T) {
 				},
 			},
 		},
-		{
-			name:   "SheddableCapacityFilter, sheddable request",
-			req:    &types.LLMRequest{Critical: false},
-			filter: &SheddableCapacityFilter{queueThreshold: 0, kvCacheThreshold: 0.8},
-			input: []types.Pod{
-				&types.PodMetrics{
-					// This pod should be returned.
-					MetricsState: &backendmetrics.MetricsState{
-						WaitingQueueSize:    0,
-						KVCacheUsagePercent: 0,
-					},
-				},
-				&types.PodMetrics{
-					// Queue is non zero, despite low kv cache, should not return.
-					MetricsState: &backendmetrics.MetricsState{
-						WaitingQueueSize:    1,
-						KVCacheUsagePercent: 0.3,
-					},
-				},
-				&types.PodMetrics{
-					// High kv cache despite zero queue, should not return
-					MetricsState: &backendmetrics.MetricsState{
-						WaitingQueueSize:    0,
-						KVCacheUsagePercent: 1.0,
-					},
-				},
-			},
-			output: []types.Pod{
-				&types.PodMetrics{
-					MetricsState: &backendmetrics.MetricsState{
-						WaitingQueueSize:    0,
-						KVCacheUsagePercent: 0,
-					},
-				},
-			},
-		},
 	}

 	for _, test := range tests {
@@ -241,7 +205,7 @@ func TestLoRASoftAffinityDistribution(t *testing.T) {
 	// initialize LoraAffinityFilter
 	LoraAffinityFilter := NewLoraAffinityFilter()

-	for i := 0; i < numIterations; i++ {
+	for range numIterations {
 		result := LoraAffinityFilter.Filter(context.Background(), req, types.NewCycleState(), pods)

 		// Check which type of pod was returned
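
Incidental to the removal, the loop in TestLoRASoftAffinityDistribution is modernized: Go 1.22 allows ranging directly over an integer, which runs the body that many times without declaring an unused index variable. A minimal standalone illustration:

package main

import "fmt"

func main() {
	// Classic counted loop with an unused index variable.
	for i := 0; i < 3; i++ {
		fmt.Println("tick")
	}
	// Equivalent Go 1.22+ form; no index variable to ignore.
	for range 3 {
		fmt.Println("tick")
	}
}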

pkg/epp/scheduling/framework/plugins/filter/sheddable_capacity_filter.go

Lines changed: 0 additions & 64 deletions
This file was deleted.

pkg/epp/scheduling/scheduler.go
Lines changed: 1 addition & 1 deletion

@@ -65,7 +65,7 @@ func NewScheduler(datastore Datastore) *Scheduler {
 	}

 	defaultProfile := framework.NewSchedulerProfile().
-		WithFilters(filter.NewSheddableCapacityFilter(), lowLatencyFilter).
+		WithFilters(lowLatencyFilter).
 		WithPicker(&picker.RandomPicker{})

 	profilePicker := profilepicker.NewAllProfilesPicker()

pkg/epp/scheduling/scheduler_test.go
Lines changed: 1 addition & 115 deletions

@@ -49,7 +49,7 @@ func TestSchedule(t *testing.T) {
 			err: true,
 		},
 		{
-			name: "critical request",
+			name: "finds optimal pod",
 			req: &types.LLMRequest{
 				TargetModel: "critical",
 				RequestId:   uuid.NewString(),
@@ -114,120 +114,6 @@
 				},
 			},
 		},
-		{
-			name: "sheddable request, accepted",
-			req: &types.LLMRequest{
-				TargetModel: "sheddable",
-				RequestId:   uuid.NewString(),
-				Critical:    false,
-			},
-			// pod1 will be picked because it has capacity for the sheddable request.
-			input: []*backendmetrics.FakePodMetrics{
-				{
-					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}},
-					Metrics: &backendmetrics.MetricsState{
-						WaitingQueueSize:    0,
-						KVCacheUsagePercent: 0.2,
-						MaxActiveModels:     2,
-						ActiveModels: map[string]int{
-							"foo": 1,
-							"bar": 1,
-						},
-					},
-				},
-				{
-					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}},
-					Metrics: &backendmetrics.MetricsState{
-						WaitingQueueSize:    3,
-						KVCacheUsagePercent: 0.1,
-						MaxActiveModels:     2,
-						ActiveModels: map[string]int{
-							"foo":      1,
-							"critical": 1,
-						},
-					},
-				},
-				{
-					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}},
-					Metrics: &backendmetrics.MetricsState{
-						WaitingQueueSize:    10,
-						KVCacheUsagePercent: 0.2,
-						MaxActiveModels:     2,
-						ActiveModels: map[string]int{
-							"foo": 1,
-						},
-					},
-				},
-			},
-			wantRes: map[string]*types.Result{
-				"default": {
-					TargetPod: &types.ScoredPod{
-						Pod: &types.PodMetrics{
-							Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}, Labels: make(map[string]string)},
-							MetricsState: &backendmetrics.MetricsState{
-								WaitingQueueSize:    0,
-								KVCacheUsagePercent: 0.2,
-								MaxActiveModels:     2,
-								ActiveModels: map[string]int{
-									"foo": 1,
-									"bar": 1,
-								},
-								WaitingModels: map[string]int{},
-							},
-						},
-					},
-				},
-			},
-		},
-		{
-			name: "sheddable request, dropped",
-			req: &types.LLMRequest{
-				TargetModel: "sheddable",
-				RequestId:   uuid.NewString(),
-				Critical:    false,
-			},
-			// All pods have higher KV cache than the threshold, so the sheddable request will be
-			// dropped.
-			input: []*backendmetrics.FakePodMetrics{
-				{
-					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}},
-					Metrics: &backendmetrics.MetricsState{
-						WaitingQueueSize:    10,
-						KVCacheUsagePercent: 0.9,
-						MaxActiveModels:     2,
-						ActiveModels: map[string]int{
-							"foo": 1,
-							"bar": 1,
-						},
-					},
-				},
-				{
-					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}},
-					Metrics: &backendmetrics.MetricsState{
-						WaitingQueueSize:    3,
-						KVCacheUsagePercent: 0.85,
-						MaxActiveModels:     2,
-						ActiveModels: map[string]int{
-							"foo":      1,
-							"critical": 1,
-						},
-					},
-				},
-				{
-					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}},
-					Metrics: &backendmetrics.MetricsState{
-						WaitingQueueSize:    10,
-						KVCacheUsagePercent: 0.85,
-						MaxActiveModels:     2,
-						ActiveModels: map[string]int{
-							"foo": 1,
-						},
-					},
-				},
-			},
-			wantRes: nil,
-			err:     true,
-		},
 	}

 	for _, test := range tests {

test/integration/epp/hermetic_test.go
Lines changed: 4 additions & 8 deletions

@@ -293,9 +293,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 		{
 			name:     "noncritical, but one server has capacity, do not shed",
 			requests: integrationutils.GenerateStreamedRequestSet(logger, "test5", modelSheddable),
-			// pod 0: selected
-			// pod 1: excluded; above KV cache threshold
-			// pod 2: excluded; above queue size threshold
+			// Pod 1 will be picked because it has relatively low queue size and low KV cache.
 			pods: newPodStates(
 				podState{index: 0, queueSize: 4, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar", modelSheddableTarget}},
 				podState{index: 1, queueSize: 0, kvCacheUsage: 0.85, activeModels: []string{"foo", modelSheddableTarget}},
@@ -309,7 +307,7 @@
 			},
 			wantErr: false,
 			wantResponses: integrationutils.NewRequestBufferedResponse(
-				"192.168.1.1:8000",
+				"192.168.1.2:8000",
 				fmt.Sprintf(`{"max_tokens":100,"model":%q,"prompt":"test5","temperature":0}`, modelSheddableTarget),
 				&configPb.HeaderValueOption{
 					Header: &configPb.HeaderValue{
@@ -347,9 +345,7 @@
 				},
 			},
 		},
-			// pod 0: selected
-			// pod 1: excluded; above KV cache threshold
-			// pod 2: excluded; above queue size threshold
+			// Pod 1 will be picked because it has relatively low queue size and low KV cache.
 			pods: newPodStates(
 				podState{index: 0, queueSize: 4, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar", modelSheddableTarget}},
 				podState{index: 1, queueSize: 0, kvCacheUsage: 0.85, activeModels: []string{"foo", modelSheddableTarget}},
@@ -363,7 +359,7 @@
 			},
 			wantErr: false,
 			wantResponses: integrationutils.NewRequestBufferedResponse(
-				"192.168.1.1:8000",
+				"192.168.1.2:8000",
 				fmt.Sprintf(`{"max_tokens":100,"model":%q,"prompt":"test6","temperature":0}`, modelSheddableTarget),
 				&configPb.HeaderValueOption{
 					Header: &configPb.HeaderValue{
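
The expected endpoint in both cases moves from pod 0 (192.168.1.1) to pod 1 (192.168.1.2): previously the capacity filter excluded pods 1 and 2 outright, leaving pod 0; now all pods are scored, and pod 1's empty queue wins out. A toy, self-contained illustration of weighted max-score picking under assumed normalizations and equal weights; the real QueueScorer/KVCacheScorer math and the env-configured weights may differ:

package main

import "fmt"

type candidate struct {
	addr      string
	queueSize int
	kvUsage   float64
}

// pickMaxScore scores each candidate as an equal-weight blend of a queue
// score (normalized against the largest queue in the set) and a KV-cache
// score (1 - utilization), then returns the highest scorer.
func pickMaxScore(cands []candidate) string {
	maxQueue := 1
	for _, c := range cands {
		if c.queueSize > maxQueue {
			maxQueue = c.queueSize
		}
	}
	best, bestScore := "", -1.0
	for _, c := range cands {
		queueScore := 1 - float64(c.queueSize)/float64(maxQueue)
		kvScore := 1 - c.kvUsage
		score := 0.5*queueScore + 0.5*kvScore // equal weights: an assumption
		if score > bestScore {
			best, bestScore = c.addr, score
		}
	}
	return best
}

func main() {
	// Pod 0 and pod 1 states from the test above (pod 2's full state is not
	// visible in this hunk, so it is omitted).
	pods := []candidate{
		{"192.168.1.1:8000", 4, 0.2},
		{"192.168.1.2:8000", 0, 0.85},
	}
	// Pod 0 scores 0.5*0.0 + 0.5*0.80 = 0.40; pod 1 scores 0.5*1.0 + 0.5*0.15 = 0.575.
	fmt.Println(pickMaxScore(pods)) // 192.168.1.2:8000 under these assumptions
}

Under these assumed weights the result matches the updated expectation, though the hermetic test's actual selection comes from the EPP's configured scorers.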
