Skip to content

Commit 54d70c9

Browse files
Adding support for service monitor on NIM Service (#58)
* Adding support for service monitor on NIM Service
  Signed-off-by: Vishesh Tanksale <[email protected]>
* Updating service monitor on NIM Service
  Signed-off-by: Vishesh Tanksale <[email protected]>
---------
Signed-off-by: Vishesh Tanksale <[email protected]>
1 parent e857583 commit 54d70c9

File tree

14 files changed

+136
-31
lines changed

14 files changed

+136
-31
lines changed

api/apps/v1alpha1/common_types.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ limitations under the License.
1717
package v1alpha1
1818

1919
import (
20+
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
2021
autoscalingv2 "k8s.io/api/autoscaling/v2"
2122
corev1 "k8s.io/api/core/v1"
2223
networkingv1 "k8s.io/api/networking/v1"
@@ -46,8 +47,9 @@ type Metrics struct {
4647

4748
// ServiceMonitor defines attributes to create a service monitor
4849
type ServiceMonitor struct {
49-
Create *bool `json:"enabled,omitempty"`
5050
// AdditionalLabels are extra labels to set on the service monitor, in addition to the service labels.
AdditionalLabels map[string]string `json:"additionalLabels,omitempty"`
51+
// Interval is a Prometheus duration string (e.g. `30s`, `1m`) used as the scrape interval of the monitored endpoint.
Interval promv1.Duration `json:"interval,omitempty"`
52+
// ScrapeTimeout is a Prometheus duration string (e.g. `30s`, `1m`) used as the scrape timeout of the monitored endpoint.
ScrapeTimeout promv1.Duration `json:"scrapeTimeout,omitempty"`
5153
}
5254

5355
// Autoscaling defines attributes to automatically scale the service based on metrics

api/apps/v1alpha1/nimservice_types.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,12 @@ package v1alpha1
1818

1919
import (
2020
"fmt"
21+
"maps"
2122
"os"
2223

2324
rendertypes "github.com/NVIDIA/k8s-nim-operator/internal/render/types"
2425
utils "github.com/NVIDIA/k8s-nim-operator/internal/utils"
26+
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
2527
autoscalingv2 "k8s.io/api/autoscaling/v2"
2628
corev1 "k8s.io/api/core/v1"
2729
networkingv1 "k8s.io/api/networking/v1"
@@ -440,6 +442,11 @@ func (n *NIMService) GetHPA() HorizontalPodAutoscalerSpec {
440442
return n.Spec.Scale.HPA
441443
}
442444

445+
// GetServiceMonitor returns the Service Monitor details for the NIMService deployment.
// The value is read from n.Spec.Metrics.ServiceMonitor and returned by value.
446+
func (n *NIMService) GetServiceMonitor() ServiceMonitor {
447+
return n.Spec.Metrics.ServiceMonitor
448+
}
449+
443450
// GetReplicas returns replicas for the NIMService deployment
444451
func (n *NIMService) GetReplicas() int {
445452
if n.IsAutoScalingEnabled() {
@@ -609,6 +616,7 @@ func (n *NIMService) GetServiceParams() *rendertypes.ServiceParams {
609616

610617
// Set service ports
611618
params.Port = n.GetServicePort()
619+
params.PortName = "open-ai-port"
612620
return params
613621
}
614622

@@ -699,6 +707,28 @@ func (n *NIMService) GetSCCParams() *rendertypes.SCCParams {
699707
return params
700708
}
701709

710+
// GetServiceMonitorParams returns the params used to render a Service Monitor from templates.
// The rendered labels are the service labels merged with the Service Monitor's
// AdditionalLabels. The spec selects this NIMService's namespace and service labels
// and scrapes the "open-ai-port" endpoint with the configured interval and scrape timeout.
711+
func (n *NIMService) GetServiceMonitorParams() *rendertypes.ServiceMonitorParams {
712+
params := &rendertypes.ServiceMonitorParams{}
713+
serviceMonitor := n.GetServiceMonitor()
714+
params.Enabled = n.IsServiceMonitorEnabled()
715+
params.Name = n.GetName()
716+
params.Namespace = n.GetNamespace()
717+
svcLabels := n.GetServiceLabels()
// NOTE(review): maps.Copy writes into svcLabels in place. It panics if GetServiceLabels
// returns a nil map while AdditionalLabels is non-empty, and it would leak the additional
// labels into any map shared with the caller — confirm GetServiceLabels always returns a
// fresh, non-nil map.
718+
maps.Copy(svcLabels, serviceMonitor.AdditionalLabels)
719+
params.Labels = svcLabels
720+
params.Annotations = n.GetServiceAnnotations()
721+
722+
// Set Service Monitor spec
723+
smSpec := monitoringv1.ServiceMonitorSpec{
724+
NamespaceSelector: monitoringv1.NamespaceSelector{MatchNames: []string{n.Namespace}},
725+
// The selector uses a second GetServiceLabels call, so it is meant to match on the
// service labels only, without the additional labels merged above.
Selector: metav1.LabelSelector{MatchLabels: n.GetServiceLabels()},
726+
// The endpoint port is referenced by name; presumably it must match the PortName set in
// GetServiceParams ("open-ai-port") — verify when changing either.
Endpoints: []monitoringv1.Endpoint{{Port: "open-ai-port", ScrapeTimeout: serviceMonitor.ScrapeTimeout, Interval: serviceMonitor.Interval}},
727+
}
728+
params.SMSpec = smSpec
729+
return params
730+
}
731+
702732
// init registers the NIMService and NIMServiceList types with the SchemeBuilder.
func init() {
703733
SchemeBuilder.Register(&NIMService{}, &NIMServiceList{})
704734
}

api/apps/v1alpha1/zz_generated.deepcopy.go

Lines changed: 0 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bundle/manifests/apps.nvidia.com_nimpipelines.yaml

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -695,8 +695,20 @@ spec:
695695
additionalProperties:
696696
type: string
697697
type: object
698-
enabled:
699-
type: boolean
698+
interval:
699+
description: |-
700+
Duration is a valid time duration that can be parsed by Prometheus model.ParseDuration() function.
701+
Supported units: y, w, d, h, m, s, ms
702+
Examples: `30s`, `1m`, `1h20m15s`, `15d`
703+
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
704+
type: string
705+
scrapeTimeout:
706+
description: |-
707+
Duration is a valid time duration that can be parsed by Prometheus model.ParseDuration() function.
708+
Supported units: y, w, d, h, m, s, ms
709+
Examples: `30s`, `1m`, `1h20m15s`, `15d`
710+
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
711+
type: string
700712
type: object
701713
type: object
702714
nimCache:

bundle/manifests/apps.nvidia.com_nimservices.yaml

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -673,8 +673,20 @@ spec:
673673
additionalProperties:
674674
type: string
675675
type: object
676-
enabled:
677-
type: boolean
676+
interval:
677+
description: |-
678+
Duration is a valid time duration that can be parsed by Prometheus model.ParseDuration() function.
679+
Supported units: y, w, d, h, m, s, ms
680+
Examples: `30s`, `1m`, `1h20m15s`, `15d`
681+
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
682+
type: string
683+
scrapeTimeout:
684+
description: |-
685+
Duration is a valid time duration that can be parsed by Prometheus model.ParseDuration() function.
686+
Supported units: y, w, d, h, m, s, ms
687+
Examples: `30s`, `1m`, `1h20m15s`, `15d`
688+
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
689+
type: string
678690
type: object
679691
type: object
680692
nimCache:

cmd/main.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"flag"
2222
"os"
2323

24+
monitoring "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
2425
// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
2526
// to ensure that exec-entrypoint and run can make use of them.
2627
_ "k8s.io/client-go/plugin/pkg/client/auth"

config/crd/bases/apps.nvidia.com_nimpipelines.yaml

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -695,8 +695,20 @@ spec:
695695
additionalProperties:
696696
type: string
697697
type: object
698-
enabled:
699-
type: boolean
698+
interval:
699+
description: |-
700+
Duration is a valid time duration that can be parsed by Prometheus model.ParseDuration() function.
701+
Supported units: y, w, d, h, m, s, ms
702+
Examples: `30s`, `1m`, `1h20m15s`, `15d`
703+
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
704+
type: string
705+
scrapeTimeout:
706+
description: |-
707+
Duration is a valid time duration that can be parsed by Prometheus model.ParseDuration() function.
708+
Supported units: y, w, d, h, m, s, ms
709+
Examples: `30s`, `1m`, `1h20m15s`, `15d`
710+
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
711+
type: string
700712
type: object
701713
type: object
702714
nimCache:

config/crd/bases/apps.nvidia.com_nimservices.yaml

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -673,8 +673,20 @@ spec:
673673
additionalProperties:
674674
type: string
675675
type: object
676-
enabled:
677-
type: boolean
676+
interval:
677+
description: |-
678+
Duration is a valid time duration that can be parsed by Prometheus model.ParseDuration() function.
679+
Supported units: y, w, d, h, m, s, ms
680+
Examples: `30s`, `1m`, `1h20m15s`, `15d`
681+
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
682+
type: string
683+
scrapeTimeout:
684+
description: |-
685+
Duration is a valid time duration that can be parsed by Prometheus model.ParseDuration() function.
686+
Supported units: y, w, d, h, m, s, ms
687+
Examples: `30s`, `1m`, `1h20m15s`, `15d`
688+
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
689+
type: string
678690
type: object
679691
type: object
680692
nimCache:

deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimpipelines.yaml

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -695,8 +695,20 @@ spec:
695695
additionalProperties:
696696
type: string
697697
type: object
698-
enabled:
699-
type: boolean
698+
interval:
699+
description: |-
700+
Duration is a valid time duration that can be parsed by Prometheus model.ParseDuration() function.
701+
Supported units: y, w, d, h, m, s, ms
702+
Examples: `30s`, `1m`, `1h20m15s`, `15d`
703+
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
704+
type: string
705+
scrapeTimeout:
706+
description: |-
707+
Duration is a valid time duration that can be parsed by Prometheus model.ParseDuration() function.
708+
Supported units: y, w, d, h, m, s, ms
709+
Examples: `30s`, `1m`, `1h20m15s`, `15d`
710+
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
711+
type: string
700712
type: object
701713
type: object
702714
nimCache:

deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimservices.yaml

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -673,8 +673,20 @@ spec:
673673
additionalProperties:
674674
type: string
675675
type: object
676-
enabled:
677-
type: boolean
676+
interval:
677+
description: |-
678+
Duration is a valid time duration that can be parsed by Prometheus model.ParseDuration() function.
679+
Supported units: y, w, d, h, m, s, ms
680+
Examples: `30s`, `1m`, `1h20m15s`, `15d`
681+
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
682+
type: string
683+
scrapeTimeout:
684+
description: |-
685+
Duration is a valid time duration that can be parsed by Prometheus model.ParseDuration() function.
686+
Supported units: y, w, d, h, m, s, ms
687+
Examples: `30s`, `1m`, `1h20m15s`, `15d`
688+
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
689+
type: string
678690
type: object
679691
type: object
680692
nimCache:

internal/conditions/conditions.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ const (
4747
ReasonHPAFailed = "HPAFailed"
4848
// ReasonSCCFailed indicates that the creation of scc has failed
4949
ReasonSCCFailed = "SCCFailed"
50+
// ReasonServiceMonitorFailed indicates that the creation of Service Monitor has failed
51+
ReasonServiceMonitorFailed = "ServiceMonitorFailed"
5052
// ReasonDeploymentFailed indicates that the creation of deployment has failed
5153
ReasonDeploymentFailed = "DeploymentFailed"
5254
// ReasonStatefulSetFailed indicates that the creation of statefulset has failed

internal/controller/platform/standalone/nimservice.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"github.com/NVIDIA/k8s-nim-operator/internal/shared"
2727
"github.com/NVIDIA/k8s-nim-operator/internal/utils"
2828
"github.com/go-logr/logr"
29+
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
2930
appsv1 "k8s.io/api/apps/v1"
3031
autoscalingv1 "k8s.io/api/autoscaling/v1"
3132
corev1 "k8s.io/api/core/v1"
@@ -132,6 +133,16 @@ func (r *NIMServiceReconciler) reconcileNIMService(ctx context.Context, nimServi
132133
}
133134
}
134135

136+
// Sync Service Monitor
137+
if nimService.IsServiceMonitorEnabled() {
138+
err = r.renderAndSyncResource(ctx, nimService, &renderer, &monitoringv1.ServiceMonitor{}, func() (client.Object, error) {
139+
return renderer.ServiceMonitor(nimService.GetServiceMonitorParams())
140+
}, "servicemonitor", conditions.ReasonServiceMonitorFailed)
141+
if err != nil {
142+
return ctrl.Result{}, err
143+
}
144+
}
145+
135146
deploymentParams := nimService.GetDeploymentParams()
136147
var modelPVC *appsv1alpha1.PersistentVolumeClaim
137148
modelProfile := ""

internal/render/types/types.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ limitations under the License.
1717
package types
1818

1919
import (
20+
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
2021
autoscalingv2 "k8s.io/api/autoscaling/v2"
2122
corev1 "k8s.io/api/core/v1"
2223
networkingv1 "k8s.io/api/networking/v1"
@@ -197,4 +198,5 @@ type ServiceMonitorParams struct {
197198
Path string
198199
Interval int32
199200
ScrapeTimeout int32
201+
SMSpec monitoringv1.ServiceMonitorSpec
200202
}

manifests/servicemonitor.yaml

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,7 @@ metadata:
1313
{{- .Annotations | yaml | nindent 4 }}
1414
{{- end }}
1515
spec:
16-
selector:
17-
labels:
18-
{{- if .MatchLabels }}
19-
{{- .MatchLabels | yaml | nindent 6 }}
20-
{{- end }}
21-
namespaceSelector:
22-
matchNames:
23-
- {{ .Namespace }}
24-
endpoints:
25-
- port: {{ .Port }}
26-
path: {{ .Path }}
27-
interval: {{ .Interval }}
28-
scrapeTimeout: {{ .ScrapeTimeout }}
16+
{{- if .SMSpec }}
17+
{{- .SMSpec | yaml | nindent 2 }}
18+
{{- end }}
2919
{{- end }}

0 commit comments

Comments
 (0)