Skip to content

Commit db7cd12

Browse files
tkatila and eero-t committed
gpu: change 'none' allocation policy
With shared-dev-num and multiple i915s in the resource request, try to find as many individual GPUs as possible to expose to the container. Previously, with multiple i915 resources, it was typical to get only one GPU device in the container. Co-authored-by: Eero Tamminen <[email protected]> Signed-off-by: Tuomas Katila <[email protected]>
1 parent 0bf4cd0 commit db7cd12

File tree

2 files changed

+74
-9
lines changed

2 files changed

+74
-9
lines changed

cmd/gpu_plugin/gpu_plugin.go

+47-4
Original file line numberDiff line numberDiff line change
@@ -64,15 +64,58 @@ type cliOptions struct {
6464

6565
type preferredAllocationPolicyFunc func(*pluginapi.ContainerPreferredAllocationRequest) []string
6666

67-
// nonePolicy is used for allocating GPU devices randomly.
67+
// nonePolicy is used for allocating GPU devices randomly, while trying
68+
// to select as many individual GPU devices as requested.
6869
func nonePolicy(req *pluginapi.ContainerPreferredAllocationRequest) []string {
6970
klog.V(2).Info("Select nonePolicy for GPU device allocation")
7071

71-
deviceIds := req.AvailableDeviceIDs[0:req.AllocationSize]
72+
devices := make(map[string]bool)
73+
selected := make(map[string]bool)
74+
neededCount := req.AllocationSize
7275

73-
klog.V(2).Infof("Allocate deviceIds: %q", deviceIds)
76+
// When shared-dev-num is greater than 1, try to find as
77+
// many independent GPUs as possible, to satisfy the request.
7478

75-
return deviceIds
79+
for _, deviceID := range req.AvailableDeviceIDs {
80+
device := strings.Split(deviceID, "-")[0]
81+
82+
if _, found := devices[device]; !found {
83+
devices[device] = true
84+
selected[deviceID] = true
85+
neededCount--
86+
87+
if neededCount == 0 {
88+
break
89+
}
90+
}
91+
}
92+
93+
// If there were not enough independent GPUs, use remaining untaken deviceIDs.
94+
95+
if neededCount > 0 {
96+
for _, deviceID := range req.AvailableDeviceIDs {
97+
if _, found := selected[deviceID]; !found {
98+
selected[deviceID] = true
99+
neededCount--
100+
101+
if neededCount == 0 {
102+
break
103+
}
104+
}
105+
}
106+
}
107+
108+
// Convert selected map into an array
109+
110+
deviceIDs := []string{}
111+
112+
for deviceID := range selected {
113+
deviceIDs = append(deviceIDs, deviceID)
114+
}
115+
116+
klog.V(2).Infof("Allocate deviceIds: %q", deviceIDs)
117+
118+
return deviceIDs
76119
}
77120

78121
// balancedPolicy is used for allocating GPU devices in balance.

cmd/gpu_plugin/gpu_plugin_test.go

+27-5
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"path"
2121
"path/filepath"
2222
"reflect"
23+
"sort"
2324
"testing"
2425

2526
"github.com/pkg/errors"
@@ -99,12 +100,21 @@ func TestGetPreferredAllocation(t *testing.T) {
99100
rqt := &v1beta1.PreferredAllocationRequest{
100101
ContainerRequests: []*v1beta1.ContainerPreferredAllocationRequest{
101102
{
102-
AvailableDeviceIDs: []string{"card0-4", "card1-1", "card2-3", "card2-4", "card2-1", "card1-0", "card1-4", "card3-4", "card1-2", "card0-1", "card2-0", "card2-2", "card1-3", "card0-2", "card3-0", "card3-3", "card0-3", "card0-0", "card3-1", "card3-2"},
103+
AvailableDeviceIDs: []string{"card0-4", "card0-2", "card1-1", "card2-3", "card2-4", "card2-1", "card1-0", "card1-4", "card3-4", "card1-2", "card0-1", "card2-0", "card2-2", "card1-3", "card3-0", "card3-3", "card0-3", "card0-0", "card3-1", "card3-2"},
103104
AllocationSize: 4,
104105
},
105106
},
106107
}
107108

109+
rqtNotEnough := &v1beta1.PreferredAllocationRequest{
110+
ContainerRequests: []*v1beta1.ContainerPreferredAllocationRequest{
111+
{
112+
AvailableDeviceIDs: []string{"card0-1", "card0-2", "card0-3", "card1-1"},
113+
AllocationSize: 3,
114+
},
115+
},
116+
}
117+
108118
rqtErr := &v1beta1.PreferredAllocationRequest{
109119
ContainerRequests: []*v1beta1.ContainerPreferredAllocationRequest{
110120
{
@@ -117,22 +127,24 @@ func TestGetPreferredAllocation(t *testing.T) {
117127
plugin := newDevicePlugin("", "", cliOptions{sharedDevNum: 5, resourceManagement: false, preferredAllocationPolicy: "none"})
118128
response, _ := plugin.GetPreferredAllocation(rqt)
119129

120-
if !reflect.DeepEqual(response.ContainerResponses[0].DeviceIDs, []string{"card0-4", "card1-1", "card2-3", "card2-4"}) {
121-
t.Error("Unexpected return value for none preferred allocation")
130+
sort.Strings(response.ContainerResponses[0].DeviceIDs)
131+
132+
if !reflect.DeepEqual(response.ContainerResponses[0].DeviceIDs, []string{"card0-4", "card1-1", "card2-3", "card3-4"}) {
133+
t.Error("Unexpected return value for none preferred allocation", response.ContainerResponses[0].DeviceIDs)
122134
}
123135

124136
plugin = newDevicePlugin("", "", cliOptions{sharedDevNum: 5, resourceManagement: false, preferredAllocationPolicy: "balanced"})
125137
response, _ = plugin.GetPreferredAllocation(rqt)
126138

127139
if !reflect.DeepEqual(response.ContainerResponses[0].DeviceIDs, []string{"card0-0", "card1-0", "card2-0", "card3-0"}) {
128-
t.Error("Unexpected return value for balanced preferred allocation")
140+
t.Error("Unexpected return value for balanced preferred allocation", response.ContainerResponses[0].DeviceIDs)
129141
}
130142

131143
plugin = newDevicePlugin("", "", cliOptions{sharedDevNum: 5, resourceManagement: false, preferredAllocationPolicy: "packed"})
132144
response, _ = plugin.GetPreferredAllocation(rqt)
133145

134146
if !reflect.DeepEqual(response.ContainerResponses[0].DeviceIDs, []string{"card0-0", "card0-1", "card0-2", "card0-3"}) {
135-
t.Error("Unexpected return value for packed preferred allocation")
147+
t.Error("Unexpected return value for packed preferred allocation", response.ContainerResponses[0].DeviceIDs)
136148
}
137149

138150
plugin = newDevicePlugin("", "", cliOptions{sharedDevNum: 5, resourceManagement: false, preferredAllocationPolicy: "none"})
@@ -141,6 +153,16 @@ func TestGetPreferredAllocation(t *testing.T) {
141153
if response != nil {
142154
t.Error("Fail to handle the input error that req.AllocationSize is greater than len(req.AvailableDeviceIDs).")
143155
}
156+
157+
plugin = newDevicePlugin("", "", cliOptions{sharedDevNum: 5, resourceManagement: false, preferredAllocationPolicy: "none"})
158+
response, _ = plugin.GetPreferredAllocation(rqtNotEnough)
159+
160+
sort.Strings(response.ContainerResponses[0].DeviceIDs)
161+
162+
if !reflect.DeepEqual(response.ContainerResponses[0].DeviceIDs, []string{"card0-1", "card0-2", "card1-1"}) {
163+
t.Error("Unexpected return value for none preferred allocation with too few separate devices",
164+
response.ContainerResponses[0].DeviceIDs)
165+
}
144166
}
145167

146168
func TestAllocate(t *testing.T) {

0 commit comments

Comments
 (0)