
Commit 8e112db

Remove redundant SheddableCapacityFilter. (#910)
Admission control and capacity management are now handled in `requestcontrol.Director.PreDispatch` (and will soon be absorbed into the new Flow Controller). This should no longer be a responsibility of the scheduling framework, and as of #805 the check is already applied before the scheduling layer is invoked.

This is not a no-op change. Previously, the `SheddableCapacityFilter` not only dropped sheddable requests when the system was at capacity; it also strictly filtered the pods that the remaining scheduling plugins would consider as input. This change removes that strict filtering, so all pods are now considered as long as the system is not saturated. Sheddable requests therefore follow the same scheduling path as critical requests, provided they are not dropped by the saturation detection check in `PreDispatch`.
1 parent ee64ec6 commit 8e112db
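
For context, the deleted filter's behavior can be reconstructed from the test cases removed below. A minimal sketch, assuming the field names from the test's composite literal and the `Filter` signature used elsewhere in filter_test.go; the deleted file's body is not shown in this diff, and `GetMetrics` is an assumption:

// Sketch only: reconstructed from the removed filter_test.go case, not from
// the deleted source file.
type SheddableCapacityFilter struct {
	queueThreshold   int
	kvCacheThreshold float64
}

// Filter passed every pod through for critical requests. For sheddable
// (non-critical) requests it kept only pods at or below both thresholds,
// so an empty result meant the request was dropped as over capacity, and
// the surviving pods were the only input the remaining plugins ever saw.
func (f *SheddableCapacityFilter) Filter(ctx context.Context, req *types.LLMRequest, cycleState *types.CycleState, pods []types.Pod) []types.Pod {
	if req.Critical {
		return pods
	}
	filtered := make([]types.Pod, 0, len(pods))
	for _, pod := range pods {
		m := pod.GetMetrics()
		if m.WaitingQueueSize <= f.queueThreshold && m.KVCacheUsagePercent <= f.kvCacheThreshold {
			filtered = append(filtered, pod)
		}
	}
	return filtered
}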

File tree

6 files changed: +7 additions, -227 deletions


cmd/epp/main.go
Lines changed: 0 additions & 2 deletions

@@ -44,7 +44,6 @@ import (
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/saturationdetector"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/filter"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/prefix"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/picker"
 	profilepicker "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/profile-picker"
@@ -209,7 +208,6 @@ func run() error {
 	kvCacheScorerWeight := envutil.GetEnvInt("KV_CACHE_SCORE_WEIGHT", scorer.DefaultKVCacheScorerWeight, setupLog)

 	schedulerProfile := framework.NewSchedulerProfile().
-		WithFilters(filter.NewSheddableCapacityFilter()).
 		WithScorers(framework.NewWeightedScorer(&scorer.QueueScorer{}, queueScorerWeight),
 			framework.NewWeightedScorer(&scorer.KVCacheScorer{}, kvCacheScorerWeight)).
 		WithPicker(picker.NewMaxScorePicker())
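
With the filter gone from the default profile, shedding happens only before scheduling. A rough sketch of that admission check as the commit message describes it, in `requestcontrol.Director.PreDispatch`; the Director itself is outside this diff, so the method body and the `saturationDetector`/`IsSaturated` names are assumptions, loosely based on the `saturationdetector` import kept above:

// Sketch only; not the actual Director implementation.
func (d *Director) PreDispatch(ctx context.Context, req *types.LLMRequest) error {
	// Critical requests are never shed.
	if req.Critical {
		return nil
	}
	// Sheddable requests are dropped only when the pool as a whole looks
	// saturated; otherwise they proceed into scheduling over the full pod
	// set, the same path critical requests take.
	if d.saturationDetector.IsSaturated(ctx) {
		return fmt.Errorf("dropping sheddable request %q: pool saturated", req.RequestId)
	}
	return nil
}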

pkg/epp/scheduling/framework/plugins/filter/filter_test.go
Lines changed: 1 addition & 37 deletions

@@ -134,42 +134,6 @@ func TestFilter(t *testing.T) {
 				},
 			},
 		},
-		{
-			name:   "SheddableCapacityFilter, sheddable request",
-			req:    &types.LLMRequest{Critical: false},
-			filter: &SheddableCapacityFilter{queueThreshold: 0, kvCacheThreshold: 0.8},
-			input: []types.Pod{
-				&types.PodMetrics{
-					// This pod should be returned.
-					MetricsState: &backendmetrics.MetricsState{
-						WaitingQueueSize:    0,
-						KVCacheUsagePercent: 0,
-					},
-				},
-				&types.PodMetrics{
-					// Queue is non zero, despite low kv cache, should not return.
-					MetricsState: &backendmetrics.MetricsState{
-						WaitingQueueSize:    1,
-						KVCacheUsagePercent: 0.3,
-					},
-				},
-				&types.PodMetrics{
-					// High kv cache despite zero queue, should not return
-					MetricsState: &backendmetrics.MetricsState{
-						WaitingQueueSize:    0,
-						KVCacheUsagePercent: 1.0,
-					},
-				},
-			},
-			output: []types.Pod{
-				&types.PodMetrics{
-					MetricsState: &backendmetrics.MetricsState{
-						WaitingQueueSize:    0,
-						KVCacheUsagePercent: 0,
-					},
-				},
-			},
-		},
 	}

 	for _, test := range tests {
@@ -241,7 +205,7 @@ func TestLoRASoftAffinityDistribution(t *testing.T) {
 	// initialize LoraAffinityFilter
 	LoraAffinityFilter := NewLoraAffinityFilter()

-	for i := 0; i < numIterations; i++ {
+	for range numIterations {
 		result := LoraAffinityFilter.Filter(context.Background(), req, types.NewCycleState(), pods)

 		// Check which type of pod was returned
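
Incidental to the removal, the loop in TestLoRASoftAffinityDistribution is modernized: Go 1.22 allows ranging directly over an integer, which runs the body that many times without declaring an unused index variable. A minimal standalone illustration:

package main

import "fmt"

func main() {
	// Classic counted loop with an unused index variable.
	for i := 0; i < 3; i++ {
		fmt.Println("tick")
	}
	// Equivalent Go 1.22+ form; no index variable to ignore.
	for range 3 {
		fmt.Println("tick")
	}
}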

pkg/epp/scheduling/framework/plugins/filter/sheddable_capacity_filter.go

Lines changed: 0 additions & 64 deletions
This file was deleted.

pkg/epp/scheduling/scheduler.go
Lines changed: 1 addition & 1 deletion

@@ -65,7 +65,7 @@ func NewScheduler(datastore Datastore) *Scheduler {
 	}

 	defaultProfile := framework.NewSchedulerProfile().
-		WithFilters(filter.NewSheddableCapacityFilter(), lowLatencyFilter).
+		WithFilters(lowLatencyFilter).
 		WithPicker(&picker.RandomPicker{})

 	profilePicker := profilepicker.NewAllProfilesPicker()

pkg/epp/scheduling/scheduler_test.go
Lines changed: 1 addition & 115 deletions

@@ -49,7 +49,7 @@ func TestSchedule(t *testing.T) {
 			err: true,
 		},
 		{
-			name: "critical request",
+			name: "finds optimal pod",
 			req: &types.LLMRequest{
 				TargetModel: "critical",
 				RequestId:   uuid.NewString(),
@@ -114,120 +114,6 @@
 				},
 			},
 		},
-		{
-			name: "sheddable request, accepted",
-			req: &types.LLMRequest{
-				TargetModel: "sheddable",
-				RequestId:   uuid.NewString(),
-				Critical:    false,
-			},
-			// pod1 will be picked because it has capacity for the sheddable request.
-			input: []*backendmetrics.FakePodMetrics{
-				{
-					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}},
-					Metrics: &backendmetrics.MetricsState{
-						WaitingQueueSize:    0,
-						KVCacheUsagePercent: 0.2,
-						MaxActiveModels:     2,
-						ActiveModels: map[string]int{
-							"foo": 1,
-							"bar": 1,
-						},
-					},
-				},
-				{
-					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}},
-					Metrics: &backendmetrics.MetricsState{
-						WaitingQueueSize:    3,
-						KVCacheUsagePercent: 0.1,
-						MaxActiveModels:     2,
-						ActiveModels: map[string]int{
-							"foo":      1,
-							"critical": 1,
-						},
-					},
-				},
-				{
-					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}},
-					Metrics: &backendmetrics.MetricsState{
-						WaitingQueueSize:    10,
-						KVCacheUsagePercent: 0.2,
-						MaxActiveModels:     2,
-						ActiveModels: map[string]int{
-							"foo": 1,
-						},
-					},
-				},
-			},
-			wantRes: map[string]*types.Result{
-				"default": {
-					TargetPod: &types.ScoredPod{
-						Pod: &types.PodMetrics{
-							Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}, Labels: make(map[string]string)},
-							MetricsState: &backendmetrics.MetricsState{
-								WaitingQueueSize:    0,
-								KVCacheUsagePercent: 0.2,
-								MaxActiveModels:     2,
-								ActiveModels: map[string]int{
-									"foo": 1,
-									"bar": 1,
-								},
-								WaitingModels: map[string]int{},
-							},
-						},
-					},
-				},
-			},
-		},
-		{
-			name: "sheddable request, dropped",
-			req: &types.LLMRequest{
-				TargetModel: "sheddable",
-				RequestId:   uuid.NewString(),
-				Critical:    false,
-			},
-			// All pods have higher KV cache than the threshold, so the sheddable request will be
-			// dropped.
-			input: []*backendmetrics.FakePodMetrics{
-				{
-					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}},
-					Metrics: &backendmetrics.MetricsState{
-						WaitingQueueSize:    10,
-						KVCacheUsagePercent: 0.9,
-						MaxActiveModels:     2,
-						ActiveModels: map[string]int{
-							"foo": 1,
-							"bar": 1,
-						},
-					},
-				},
-				{
-					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}},
-					Metrics: &backendmetrics.MetricsState{
-						WaitingQueueSize:    3,
-						KVCacheUsagePercent: 0.85,
-						MaxActiveModels:     2,
-						ActiveModels: map[string]int{
-							"foo":      1,
-							"critical": 1,
-						},
-					},
-				},
-				{
-					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}},
-					Metrics: &backendmetrics.MetricsState{
-						WaitingQueueSize:    10,
-						KVCacheUsagePercent: 0.85,
-						MaxActiveModels:     2,
-						ActiveModels: map[string]int{
-							"foo": 1,
-						},
-					},
-				},
-			},
-			wantRes: nil,
-			err:     true,
-		},
 	}

 	for _, test := range tests {

test/integration/epp/hermetic_test.go
Lines changed: 4 additions & 8 deletions

@@ -293,9 +293,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 		{
 			name:     "noncritical, but one server has capacity, do not shed",
 			requests: integrationutils.GenerateStreamedRequestSet(logger, "test5", modelSheddable),
-			// pod 0: selected
-			// pod 1: excluded; above KV cache threshold
-			// pod 2: excluded; above queue size threshold
+			// Pod 1 will be picked because it has relatively low queue size and low KV cache.
 			pods: newPodStates(
 				podState{index: 0, queueSize: 4, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar", modelSheddableTarget}},
 				podState{index: 1, queueSize: 0, kvCacheUsage: 0.85, activeModels: []string{"foo", modelSheddableTarget}},
@@ -309,7 +307,7 @@
 			},
 			wantErr: false,
 			wantResponses: integrationutils.NewRequestBufferedResponse(
-				"192.168.1.1:8000",
+				"192.168.1.2:8000",
 				fmt.Sprintf(`{"max_tokens":100,"model":%q,"prompt":"test5","temperature":0}`, modelSheddableTarget),
 				&configPb.HeaderValueOption{
 					Header: &configPb.HeaderValue{
@@ -347,9 +345,7 @@
 				},
 			},
 		},
-			// pod 0: selected
-			// pod 1: excluded; above KV cache threshold
-			// pod 2: excluded; above queue size threshold
+			// Pod 1 will be picked because it has relatively low queue size and low KV cache.
 			pods: newPodStates(
 				podState{index: 0, queueSize: 4, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar", modelSheddableTarget}},
 				podState{index: 1, queueSize: 0, kvCacheUsage: 0.85, activeModels: []string{"foo", modelSheddableTarget}},
@@ -363,7 +359,7 @@
 			},
 			wantErr: false,
 			wantResponses: integrationutils.NewRequestBufferedResponse(
-				"192.168.1.1:8000",
+				"192.168.1.2:8000",
 				fmt.Sprintf(`{"max_tokens":100,"model":%q,"prompt":"test6","temperature":0}`, modelSheddableTarget),
 				&configPb.HeaderValueOption{
 					Header: &configPb.HeaderValue{
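
The expected endpoint in both cases moves from pod 0 (192.168.1.1) to pod 1 (192.168.1.2): previously the capacity filter excluded pods 1 and 2 outright, leaving pod 0; now all pods are scored, and pod 1's empty queue wins out. A toy, self-contained illustration of weighted max-score picking under assumed normalizations and equal weights; the real QueueScorer/KVCacheScorer math and the env-configured weights may differ:

package main

import "fmt"

type candidate struct {
	addr      string
	queueSize int
	kvUsage   float64
}

// pickMaxScore scores each candidate as an equal-weight blend of a queue
// score (normalized against the largest queue in the set) and a KV-cache
// score (1 - utilization), then returns the highest scorer.
func pickMaxScore(cands []candidate) string {
	maxQueue := 1
	for _, c := range cands {
		if c.queueSize > maxQueue {
			maxQueue = c.queueSize
		}
	}
	best, bestScore := "", -1.0
	for _, c := range cands {
		queueScore := 1 - float64(c.queueSize)/float64(maxQueue)
		kvScore := 1 - c.kvUsage
		score := 0.5*queueScore + 0.5*kvScore // equal weights: an assumption
		if score > bestScore {
			best, bestScore = c.addr, score
		}
	}
	return best
}

func main() {
	// Pod 0 and pod 1 states from the test above (pod 2's full state is not
	// visible in this hunk, so it is omitted).
	pods := []candidate{
		{"192.168.1.1:8000", 4, 0.2},
		{"192.168.1.2:8000", 0, 0.85},
	}
	// Pod 0 scores 0.5*0.0 + 0.5*0.80 = 0.40; pod 1 scores 0.5*1.0 + 0.5*0.15 = 0.575.
	fmt.Println(pickMaxScore(pods)) // 192.168.1.2:8000 under these assumptions
}

Under these assumed weights the result matches the updated expectation, though the hermetic test's actual selection comes from the EPP's configured scorers.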
