Skip to content

Commit

Permalink
Merge pull request #14 from Nimbus318/feat/add-node-selector-support
Browse files Browse the repository at this point in the history
Add support for configuring NodeSelectors for different node types
  • Loading branch information
Nimbus318 authored Jan 9, 2025
2 parents f7a7837 + c1e38c0 commit f080a9d
Show file tree
Hide file tree
Showing 11 changed files with 81 additions and 47 deletions.
6 changes: 5 additions & 1 deletion charts/hami-webui/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,8 @@ data:
timeout: 1s
prometheus:
address: {{ ternary .Values.externalPrometheus.address (printf "http://%s-kube-prometh-prometheus.%s.svc.cluster.local:9090" (include "hami-webui.fullname" .) (include "hami-webui.namespace" .)) .Values.externalPrometheus.enabled }}
timeout: 1m
timeout: 1m
node_selectors:
{{- range $key, $value := .Values.vendorNodeSelectors }}
{{ $key }}: {{ $value }}
{{- end }}
10 changes: 8 additions & 2 deletions charts/hami-webui/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,22 @@

replicaCount: 1

vendorNodeSelectors:
NVIDIA: gpu=on
Ascend: ascend=on
DCU: dcu=on
MLU: mlu=on

image:
frontend:
repository: projecthami/hami-webui-fe-oss
pullPolicy: IfNotPresent
# Overrides the image tag whose default is the chart appVersion.
tag: "v1.0.4"
tag: "main"
backend:
repository: projecthami/hami-webui-be-oss
pullPolicy: IfNotPresent
tag: "v1.0.4"
tag: "main"

imagePullSecrets: []
nameOverride: ""
Expand Down
5 changes: 5 additions & 0 deletions server/cmd/server/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"github.com/go-kratos/kratos/v2/transport/grpc"
"github.com/go-kratos/kratos/v2/transport/http"
"os"
"vgpu/internal/conf"

_ "go.uber.org/automaxprocs"
)
Expand Down Expand Up @@ -57,3 +58,7 @@ func newApp(ctx context.Context, logger log.Logger, gs *grpc.Server, hs *http.Se
),
)
}

func getNodeSelectors(c *conf.Bootstrap) map[string]string {
return c.NodeSelectors
}
1 change: 1 addition & 0 deletions server/cmd/server/wire.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,6 @@ func initApp(configPath string, ctx context.Context) (*kratos.App, func(), error
service.ProviderSet,
exporter.ProviderSet,
newApp,
getNodeSelectors,
))
}
5 changes: 5 additions & 0 deletions server/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,8 @@ server:
prometheus:
address: http://localhost:9090
timeout: 1m
node_selectors:
NVIDIA: gpu=on
Ascend: ascend=on
DCU: dcu=on
MLU: mlu=on
1 change: 1 addition & 0 deletions server/internal/conf/conf.proto
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import "google/protobuf/duration.proto";
message Bootstrap {
Server server = 1;
Prometheus prometheus = 2;
map<string, string> node_selectors = 3;
}

message Server {
Expand Down
10 changes: 5 additions & 5 deletions server/internal/data/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,17 +32,17 @@ type nodeRepo struct {
}

// NewNodeRepo .
func NewNodeRepo(data *Data, logger log.Logger) biz.NodeRepo {
func NewNodeRepo(data *Data, nodeSelectors map[string]string, logger log.Logger) biz.NodeRepo {
nodeRepo := &nodeRepo{
data: data,
nodeNotify: make(chan struct{}, 1),
nodes: map[k8stypes.UID]*biz.Node{},
log: log.NewHelper(logger),
providers: []provider.Provider{
nvidia.NewNvidia(data.promCl, log.NewHelper(logger)),
mlu.NewCambricon(data.promCl, log.NewHelper(logger)),
ascend.NewAscend(data.promCl, log.NewHelper(logger)),
hygon.NewHygon(data.promCl, log.NewHelper(logger)),
nvidia.NewNvidia(data.promCl, log.NewHelper(logger), nodeSelectors[biz.NvidiaGPUDevice]),
mlu.NewCambricon(data.promCl, log.NewHelper(logger), nodeSelectors[biz.CambriconGPUDevice]),
ascend.NewAscend(data.promCl, log.NewHelper(logger), nodeSelectors[biz.AscendGPUDevice]),
hygon.NewHygon(data.promCl, log.NewHelper(logger), nodeSelectors[biz.HygonGPUDevice]),
},
}
nodeRepo.init()
Expand Down
27 changes: 15 additions & 12 deletions server/internal/provider/ascend/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,23 @@ import (
type Ascend struct {
prom *prom.Client
log *log.Helper

nodeSelectors string
}

func NewAscend(prom *prom.Client, log *log.Helper) *Ascend {
func NewAscend(prom *prom.Client, log *log.Helper, nodeSelectors string) *Ascend {
return &Ascend{
prom: prom,
log: log,
prom: prom,
log: log,
nodeSelectors: nodeSelectors,
}
}

func (c *Ascend) GetNodeDevicePluginLabels() (labels.Selector, error) {
return labels.Parse("servertype=Ascend910B-20")
func (a *Ascend) GetNodeDevicePluginLabels() (labels.Selector, error) {
return labels.Parse(a.nodeSelectors)
}

func (c *Ascend) GetProvider() string {
func (a *Ascend) GetProvider() string {
return AscendDevice
}

Expand All @@ -39,16 +42,16 @@ type DeviceMeta struct {
Driver string
}

func (c *Ascend) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.DeviceInfo {
func (a *Ascend) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.DeviceInfo {
device := make(map[string]*util.DeviceInfo)
queryString := fmt.Sprintf("npu_chip_info_health_status{node=\"%s\"}", node.Name)
vs, err := c.prom.Query(context.Background(), queryString)
vs, err := a.prom.Query(context.Background(), queryString)
if err != nil {
c.log.Warnf("query %s failed", queryString)
a.log.Warnf("query %s failed", queryString)
} else {
ds, ok := vs.(model.Vector)
if !ok {
c.log.Warnf("vectorValue: %v, failed", vs)
a.log.Warnf("vectorValue: %v, failed", vs)
} else {
for _, d := range ds {
id := d.Metric["id"]
Expand All @@ -68,12 +71,12 @@ func (c *Ascend) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.De
return device
}

func (c *Ascend) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) {
func (a *Ascend) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) {

nodedevices := []*util.DeviceInfo{}
i := 0
cards, _ := node.Status.Capacity.Name(corev1.ResourceName(AscendResourceCoreCount), resource.DecimalSI).AsInt64()
tmpDevice := c.GetDevicesFromPrometheus(node)
tmpDevice := a.GetDevicesFromPrometheus(node)
for int64(i)*10 < cards {
index := fmt.Sprintf("%d", i)
if _, ok := tmpDevice[index]; !ok {
Expand Down
41 changes: 22 additions & 19 deletions server/internal/provider/hygon/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,23 @@ import (
type Hygon struct {
prom *prom.Client
log *log.Helper

nodeSelectors string
}

func NewHygon(prom *prom.Client, log *log.Helper) *Hygon {
func NewHygon(prom *prom.Client, log *log.Helper, nodeSelectors string) *Hygon {
return &Hygon{
prom: prom,
log: log,
prom: prom,
log: log,
nodeSelectors: nodeSelectors,
}
}

func (c *Hygon) GetNodeDevicePluginLabels() (labels.Selector, error) {
return labels.Parse("dcu=on")
func (h *Hygon) GetNodeDevicePluginLabels() (labels.Selector, error) {
return labels.Parse(h.nodeSelectors)
}

func (c *Hygon) GetProvider() string {
func (h *Hygon) GetProvider() string {
return HygonDCUDevice
}

Expand All @@ -40,19 +43,19 @@ type DeviceMeta struct {
Driver string
}

func (c *Hygon) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.DeviceInfo {
func (h *Hygon) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.DeviceInfo {
deviceMap := make(map[string]*util.DeviceInfo)
queryString := fmt.Sprintf("dcu_temp{node=\"%s\"}", node.Name)

vs, err := c.prom.Query(context.Background(), queryString)
vs, err := h.prom.Query(context.Background(), queryString)
if err != nil {
c.log.Warnf("Failed to query %s: %v", queryString, err)
h.log.Warnf("Failed to query %s: %v", queryString, err)
return deviceMap
}

vector, ok := vs.(model.Vector)
if !ok {
c.log.Warnf("Unexpected result type: %v", vs)
h.log.Warnf("Unexpected result type: %v", vs)
return deviceMap
}

Expand All @@ -68,34 +71,34 @@ func (c *Hygon) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.Dev
return deviceMap
}

func (c *Hygon) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) {
func (h *Hygon) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) {
devEncoded, ok := node.Annotations[RegisterAnnos]
if !ok {
return []*util.DeviceInfo{}, errors.New("annos not found " + RegisterAnnos)
}
nodedevices, err := util.DecodeNodeDevices(devEncoded, c.log)
nodedevices, err := util.DecodeNodeDevices(devEncoded, h.log)
if err != nil {
c.log.Errorw("failed to decode node devices", err, "node", node.Name, "device annotation", devEncoded)
h.log.Errorw("failed to decode node devices", err, "node", node.Name, "device annotation", devEncoded)
return []*util.DeviceInfo{}, err
}
if len(nodedevices) == 0 {
c.log.Infow("event", "no gpu device found", "node", node.Name, "device annotation", devEncoded)
h.log.Infow("event", "no gpu device found", "node", node.Name, "device annotation", devEncoded)
return []*util.DeviceInfo{}, errors.New("no gpu found on node")
}
devDecoded := util.EncodeNodeDevices(nodedevices, c.log)
c.log.Infow("event", "nodes device information", "node", node.Name, "nodedevices", devDecoded)
devDetail := c.GetDevicesFromPrometheus(node)
devDecoded := util.EncodeNodeDevices(nodedevices, h.log)
h.log.Infow("event", "nodes device information", "node", node.Name, "nodedevices", devDecoded)
devDetail := h.GetDevicesFromPrometheus(node)
for _, nodedevice := range nodedevices {
idParts := strings.Split(nodedevice.ID, "-")
if len(idParts) < 2 {
c.log.Warnf("Invalid nodedevice.ID format: %s", nodedevice.ID)
h.log.Warnf("Invalid nodedevice.ID format: %s", nodedevice.ID)
continue
}

devDetailID := idParts[1]
devInfo, exists := devDetail[devDetailID]
if !exists {
c.log.Warnf("Device ID %s not found in devDetail", devDetailID)
h.log.Warnf("Device ID %s not found in devDetail", devDetailID)
continue
}

Expand Down
11 changes: 7 additions & 4 deletions server/internal/provider/mlu/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,20 @@ import (
type Cambricon struct {
prom *prom.Client
log *log.Helper

labelsSelector string
}

func NewCambricon(prom *prom.Client, log *log.Helper) *Cambricon {
func NewCambricon(prom *prom.Client, log *log.Helper, labelSelector string) *Cambricon {
return &Cambricon{
prom: prom,
log: log,
prom: prom,
log: log,
labelsSelector: labelSelector,
}
}

func (c *Cambricon) GetNodeDevicePluginLabels() (labels.Selector, error) {
return labels.Parse("mlu=on")
return labels.Parse(c.labelsSelector)
}

func (c *Cambricon) GetProvider() string {
Expand Down
11 changes: 7 additions & 4 deletions server/internal/provider/nvidia/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,20 @@ import (
type Nvidia struct {
prom *prom.Client
log *log.Helper

labelSelector string
}

func NewNvidia(prom *prom.Client, log *log.Helper) *Nvidia {
func NewNvidia(prom *prom.Client, log *log.Helper, labelSelector string) *Nvidia {
return &Nvidia{
prom: prom,
log: log,
prom: prom,
log: log,
labelSelector: labelSelector,
}
}

func (n *Nvidia) GetNodeDevicePluginLabels() (labels.Selector, error) {
return labels.Parse("gpu=on")
return labels.Parse(n.labelSelector)
}

func (n *Nvidia) GetProvider() string {
Expand Down

0 comments on commit f080a9d

Please sign in to comment.