Skip to content

Commit 8910624

Browse files
authored
Merge pull request #1454 from tkatila/xpum-sidecar-updates
Xpum sidecar updates
2 parents 53310c2 + 708b5b4 commit 8910624

File tree

3 files changed

+58
-19
lines changed

3 files changed

+58
-19
lines changed

cmd/xpumanager_sidecar/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ Intel GPUs can be interconnected via an XeLink. In some workloads it is benefici
2121
| -interval | int | 10 | Interval for XeLink topology fetching and label writing (seconds, >= 1) |
2222
| -startup-delay | int | 10 | Startup delay before the first topology fetching (seconds, >= 0) |
2323
| -label-namespace | string | gpu.intel.com | Namespace or prefix for the labels, e.g. **gpu.intel.com**/xe-links |
24+
| -allow-subdeviceless-links | bool | false | Include XeLinks that are not on subdevices (e.g. links on single-tile GPUs) |
25+
| -use-https | bool | false | Use HTTPS protocol when connecting to XPU-Manager |
2426

2527
The sidecar also accepts a number of other arguments. Please use the -h option to see the complete list of options.
2628

cmd/xpumanager_sidecar/main.go

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"bufio"
1919
"bytes"
2020
"context"
21+
"crypto/tls"
2122
"flag"
2223
"fmt"
2324
"io"
@@ -55,15 +56,17 @@ type xpuManagerTopologyMatrixCell struct {
5556
}
5657

5758
type xpuManagerSidecar struct {
58-
getMetricsData func() []byte
59-
tmpDirPrefix string
60-
dstFilePath string
61-
labelNamespace string
62-
url string
63-
interval uint64
64-
startDelay uint64
65-
xpumPort uint64
66-
laneCount uint64
59+
getMetricsData func() []byte
60+
tmpDirPrefix string
61+
dstFilePath string
62+
labelNamespace string
63+
url string
64+
interval uint64
65+
startDelay uint64
66+
xpumPort uint64
67+
laneCount uint64
68+
allowSubdevicelessLinks bool
69+
useHTTPS bool
6770
}
6871

6972
func (e *invalidEntryErr) Error() string {
@@ -75,6 +78,14 @@ func (xms *xpuManagerSidecar) getMetricsDataFromXPUM() []byte {
7578
Timeout: 5 * time.Second,
7679
}
7780

81+
if xms.useHTTPS {
82+
customTransport := http.DefaultTransport.(*http.Transport).Clone()
83+
//#nosec
84+
customTransport.TLSClientConfig = &tls.Config{InsecureSkipVerify: true}
85+
86+
client.Transport = customTransport
87+
}
88+
7889
ctx := context.Background()
7990

8091
req, err := http.NewRequestWithContext(ctx, http.MethodGet, xms.url, http.NoBody)
@@ -108,7 +119,7 @@ func (xms *xpuManagerSidecar) getMetricsDataFromXPUM() []byte {
108119
return resBody
109120
}
110121

111-
func processMetricsLabels(labels []*io_prometheus_client.LabelPair) (xpuManagerTopologyMatrixCell, error) {
122+
func processMetricsLabels(labels []*io_prometheus_client.LabelPair, allowNonSubdeviceLinks bool) (xpuManagerTopologyMatrixCell, error) {
112123
cell := createInvalidTopologyCell()
113124

114125
for _, label := range labels {
@@ -118,7 +129,7 @@ func processMetricsLabels(labels []*io_prometheus_client.LabelPair) (xpuManagerT
118129
klog.V(5).Info(name, " ", strVal)
119130

120131
// xelinks must be on subdevices unless subdeviceless links are explicitly allowed
121-
if name == "local_on_subdevice" && strVal != "true" {
132+
if !allowNonSubdeviceLinks && name == "local_on_subdevice" && strVal != "true" {
122133
return cell, &invalidEntryErr{}
123134
}
124135

@@ -193,7 +204,7 @@ func (xms *xpuManagerSidecar) GetTopologyFromXPUMMetrics(data []byte) (topologyI
193204
continue
194205
}
195206

196-
cell, err := processMetricsLabels(metric.Label)
207+
cell, err := processMetricsLabels(metric.Label, xms.allowSubdevicelessLinks)
197208
if err == nil {
198209
klog.V(5).Info("topology entry: ", cell)
199210
topologyInfos = append(topologyInfos, cell)
@@ -367,6 +378,8 @@ func main() {
367378
flag.StringVar(&xms.dstFilePath, "dst-file-path", "/etc/kubernetes/node-feature-discovery/features.d/xpum-sidecar-labels.txt", "label file destination")
368379
flag.Uint64Var(&xms.laneCount, "lane-count", 4, "minimum lane count for xelink")
369380
flag.StringVar(&xms.labelNamespace, "label-namespace", "gpu.intel.com", "namespace for the labels")
381+
flag.BoolVar(&xms.allowSubdevicelessLinks, "allow-subdeviceless-links", false, "allow xelinks that are not tied to subdevices (=1 tile GPUs)")
382+
flag.BoolVar(&xms.useHTTPS, "use-https", false, "Use HTTPS protocol to connect to xpumanager")
370383
klog.InitFlags(nil)
371384

372385
flag.Parse()
@@ -375,7 +388,12 @@ func main() {
375388
klog.Fatal("zero interval won't work, set it to at least 1")
376389
}
377390

378-
xms.url = fmt.Sprintf("http://127.0.0.1:%d/metrics", xms.xpumPort)
391+
protocol := "http"
392+
if xms.useHTTPS {
393+
protocol = "https"
394+
}
395+
396+
xms.url = fmt.Sprintf("%s://127.0.0.1:%d/metrics", protocol, xms.xpumPort)
379397

380398
keepIterating := true
381399

cmd/xpumanager_sidecar/main_test.go

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,11 @@ import (
2323
)
2424

2525
type testCase struct {
26-
name string
27-
metricsData []string
28-
expectedLabels []string
29-
minLaneCount int
26+
name string
27+
metricsData []string
28+
expectedLabels []string
29+
minLaneCount int
30+
allowSubdeviceless bool
3031
}
3132

3233
func createTestCases() []testCase {
@@ -59,12 +60,25 @@ func createTestCases() []testCase {
5960
metricsData: []string{
6061
`# HELP xpum_topology_link Connection type fo two GPU tiles`,
6162
`# TYPE xpum_topology_link gauge`,
62-
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="0",remote_subdevice_id="0"} 1`,
63-
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0"} 1`,
63+
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0",lane_count="4"} 1`,
64+
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="1",lane_count="4"} 1`,
6465
"",
6566
},
6667
expectedLabels: []string{"xpumanager.intel.com/xe-links="},
6768
},
69+
{
70+
name: "Xelinks not on sub devices when it's allowed",
71+
minLaneCount: 4,
72+
metricsData: []string{
73+
`# HELP xpum_topology_link Connection type fo two GPU tiles`,
74+
`# TYPE xpum_topology_link gauge`,
75+
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0",lane_count="4"} 1`,
76+
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="1",lane_count="4"} 1`,
77+
"",
78+
},
79+
expectedLabels: []string{"xpumanager.intel.com/xe-links=0.0-1.0_0.0-1.1"},
80+
allowSubdeviceless: true,
81+
},
6882
{
6983
name: "Xelinks without lan counts",
7084
minLaneCount: 4,
@@ -208,6 +222,9 @@ func TestLabeling(t *testing.T) {
208222
for _, tc := range tcs {
209223
print("Testcase (labeling): ", tc.name, "\n")
210224
xms := tc.createFakeXMS(tc.metricsData, tc.minLaneCount)
225+
226+
xms.allowSubdevicelessLinks = tc.allowSubdeviceless
227+
211228
topologyInfos := xms.GetTopologyFromXPUMMetrics([]byte(strings.Join(tc.metricsData, "\n")))
212229

213230
labels := xms.createLabels(topologyInfos)
@@ -224,6 +241,8 @@ func TestIterate(t *testing.T) {
224241
print("Testcase (iterate): ", tc.name, "\n")
225242
xms := tc.createFakeXMS(tc.metricsData, tc.minLaneCount)
226243

244+
xms.allowSubdevicelessLinks = tc.allowSubdeviceless
245+
227246
root, err := os.MkdirTemp("", "test_new_xms")
228247
if err != nil {
229248
t.Fatalf("can't create temporary directory: %+v", err)

0 commit comments

Comments
 (0)