Skip to content

Commit e34e93b

Browse files
committed
xpum sidecar: allow xelinks that are not tied to subdevices
With single-tile GPUs, xelinks are no longer advertised as being on subdevices. Signed-off-by: Tuomas Katila <[email protected]>
1 parent 53310c2 commit e34e93b

File tree

2 files changed

+39
-18
lines changed

2 files changed

+39
-18
lines changed

cmd/xpumanager_sidecar/main.go

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -55,15 +55,16 @@ type xpuManagerTopologyMatrixCell struct {
5555
}
5656

5757
type xpuManagerSidecar struct {
58-
getMetricsData func() []byte
59-
tmpDirPrefix string
60-
dstFilePath string
61-
labelNamespace string
62-
url string
63-
interval uint64
64-
startDelay uint64
65-
xpumPort uint64
66-
laneCount uint64
58+
getMetricsData func() []byte
59+
tmpDirPrefix string
60+
dstFilePath string
61+
labelNamespace string
62+
url string
63+
interval uint64
64+
startDelay uint64
65+
xpumPort uint64
66+
laneCount uint64
67+
allowSubdevicelessLinks bool
6768
}
6869

6970
func (e *invalidEntryErr) Error() string {
@@ -108,7 +109,7 @@ func (xms *xpuManagerSidecar) getMetricsDataFromXPUM() []byte {
108109
return resBody
109110
}
110111

111-
func processMetricsLabels(labels []*io_prometheus_client.LabelPair) (xpuManagerTopologyMatrixCell, error) {
112+
func processMetricsLabels(labels []*io_prometheus_client.LabelPair, allowNonSubdeviceLinks bool) (xpuManagerTopologyMatrixCell, error) {
112113
cell := createInvalidTopologyCell()
113114

114115
for _, label := range labels {
@@ -118,7 +119,7 @@ func processMetricsLabels(labels []*io_prometheus_client.LabelPair) (xpuManagerT
118119
klog.V(5).Info(name, " ", strVal)
119120

120121
// xelinks should always be on subdevices
121-
if name == "local_on_subdevice" && strVal != "true" {
122+
if !allowNonSubdeviceLinks && name == "local_on_subdevice" && strVal != "true" {
122123
return cell, &invalidEntryErr{}
123124
}
124125

@@ -193,7 +194,7 @@ func (xms *xpuManagerSidecar) GetTopologyFromXPUMMetrics(data []byte) (topologyI
193194
continue
194195
}
195196

196-
cell, err := processMetricsLabels(metric.Label)
197+
cell, err := processMetricsLabels(metric.Label, xms.allowSubdevicelessLinks)
197198
if err == nil {
198199
klog.V(5).Info("topology entry: ", cell)
199200
topologyInfos = append(topologyInfos, cell)
@@ -367,6 +368,7 @@ func main() {
367368
flag.StringVar(&xms.dstFilePath, "dst-file-path", "/etc/kubernetes/node-feature-discovery/features.d/xpum-sidecar-labels.txt", "label file destination")
368369
flag.Uint64Var(&xms.laneCount, "lane-count", 4, "minimum lane count for xelink")
369370
flag.StringVar(&xms.labelNamespace, "label-namespace", "gpu.intel.com", "namespace for the labels")
371+
flag.BoolVar(&xms.allowSubdevicelessLinks, "allow-subdeviceless-links", false, "allow xelinks that are not tied to subdevices (=1 tile GPUs)")
370372
klog.InitFlags(nil)
371373

372374
flag.Parse()

cmd/xpumanager_sidecar/main_test.go

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,11 @@ import (
2323
)
2424

2525
type testCase struct {
26-
name string
27-
metricsData []string
28-
expectedLabels []string
29-
minLaneCount int
26+
name string
27+
metricsData []string
28+
expectedLabels []string
29+
minLaneCount int
30+
allowSubdeviceless bool
3031
}
3132

3233
func createTestCases() []testCase {
@@ -59,12 +60,25 @@ func createTestCases() []testCase {
5960
metricsData: []string{
6061
`# HELP xpum_topology_link Connection type fo two GPU tiles`,
6162
`# TYPE xpum_topology_link gauge`,
62-
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="0",remote_subdevice_id="0"} 1`,
63-
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0"} 1`,
63+
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0",lane_count="4"} 1`,
64+
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="1",lane_count="4"} 1`,
6465
"",
6566
},
6667
expectedLabels: []string{"xpumanager.intel.com/xe-links="},
6768
},
69+
{
70+
name: "Xelinks not on sub devices when it's allowed",
71+
minLaneCount: 4,
72+
metricsData: []string{
73+
`# HELP xpum_topology_link Connection type fo two GPU tiles`,
74+
`# TYPE xpum_topology_link gauge`,
75+
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0",lane_count="4"} 1`,
76+
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="1",lane_count="4"} 1`,
77+
"",
78+
},
79+
expectedLabels: []string{"xpumanager.intel.com/xe-links=0.0-1.0_0.0-1.1"},
80+
allowSubdeviceless: true,
81+
},
6882
{
6983
name: "Xelinks without lan counts",
7084
minLaneCount: 4,
@@ -208,6 +222,9 @@ func TestLabeling(t *testing.T) {
208222
for _, tc := range tcs {
209223
print("Testcase (labeling): ", tc.name, "\n")
210224
xms := tc.createFakeXMS(tc.metricsData, tc.minLaneCount)
225+
226+
xms.allowSubdevicelessLinks = tc.allowSubdeviceless
227+
211228
topologyInfos := xms.GetTopologyFromXPUMMetrics([]byte(strings.Join(tc.metricsData, "\n")))
212229

213230
labels := xms.createLabels(topologyInfos)
@@ -224,6 +241,8 @@ func TestIterate(t *testing.T) {
224241
print("Testcase (iterate): ", tc.name, "\n")
225242
xms := tc.createFakeXMS(tc.metricsData, tc.minLaneCount)
226243

244+
xms.allowSubdevicelessLinks = tc.allowSubdeviceless
245+
227246
root, err := os.MkdirTemp("", "test_new_xms")
228247
if err != nil {
229248
t.Fatalf("can't create temporary directory: %+v", err)

0 commit comments

Comments
 (0)