Skip to content

Commit

Permalink
Fixing Evaluator to add init containers
Browse files Browse the repository at this point in the history
Signed-off-by: Vishesh Tanksale <[email protected]>
  • Loading branch information
visheshtanksale committed Feb 6, 2025
1 parent e14dddc commit c12ae77
Show file tree
Hide file tree
Showing 9 changed files with 107 additions and 49 deletions.
2 changes: 1 addition & 1 deletion api/apps/v1alpha1/nemo_common_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ type OTelSpec struct {
Enabled *bool `json:"enabled,omitempty"`

// ExporterOtlpEndpoint is the OTLP collector endpoint.
// +kubebuilder:validation:MinLength=1
// +kubebuilder:validation:Optional
ExporterOtlpEndpoint string `json:"exporterOtlpEndpoint"`

// DisableLogging indicates whether Python logging auto-instrumentation should be disabled.
Expand Down
67 changes: 67 additions & 0 deletions api/apps/v1alpha1/nemo_evaluator_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,10 @@ import (
corev1 "k8s.io/api/core/v1"
networkingv1 "k8s.io/api/networking/v1"
rbacv1 "k8s.io/api/rbac/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/utils/ptr"
)

const (
Expand Down Expand Up @@ -826,6 +828,71 @@ func (n *NemoEvaluator) GetServiceMonitorAnnotations() map[string]string {
return NemoEvaluatorAnnotations
}

// GetInitContainers returns the init containers for the NemoEvaluator
// deployment, in the order they are listed:
//
//  1. "wait-for-postgres": a busybox container that loops until `nc -z`
//     succeeds against the host and port in Spec.DatabaseConfig, printing a
//     message and sleeping 5 seconds between attempts.
//  2. "evaluator-db-migration": a container using the evaluator image that
//     runs /app/scripts/run-db-migration.sh as UID 0, with CPU and memory
//     limits of 1 and 1Gi.
//
// The migration container receives NAMESPACE, ARGO_HOST, EVAL_CONTAINER and
// DATA_STORE_HOST, followed by the variables returned by GetPostgresEnv.
func (n *NemoEvaluator) GetInitContainers() []corev1.Container {
	// The whole wait loop is a single shell script handed to `sh -c`.
	connCmd := fmt.Sprintf(
		"until nc -z %s %d; do echo \"Waiting for Postgres to start \"; sleep 5; done",
		n.Spec.DatabaseConfig.Host,
		n.Spec.DatabaseConfig.Port)

	envVars := []corev1.EnvVar{
		{
			Name:  "NAMESPACE",
			Value: n.Namespace,
		},
		{
			Name:  "ARGO_HOST",
			Value: n.Spec.ArgoWorkflows.Endpoint,
		},
		{
			Name:  "EVAL_CONTAINER",
			Value: n.GetImage(),
		},
		{
			Name:  "DATA_STORE_HOST",
			Value: n.Spec.Datastore.Endpoint,
		},
	}
	// Append the environment variables for Postgres.
	envVars = append(envVars, n.GetPostgresEnv()...)

	return []corev1.Container{
		{
			Name:            "wait-for-postgres",
			Image:           "busybox",
			ImagePullPolicy: corev1.PullPolicy(n.GetImagePullPolicy()),
			// Only the script follows "-c". Anything after it would be bound
			// to the shell's positional parameters ($0, $1, ...) instead of
			// being executed, so the loop body must live inside connCmd.
			Command: []string{
				"sh", "-c", connCmd,
			},
		},
		{
			Name:            "evaluator-db-migration",
			Image:           n.GetImage(),
			ImagePullPolicy: corev1.PullPolicy(n.GetImagePullPolicy()),
			Command: []string{
				"sh", "-c", "/app/scripts/run-db-migration.sh",
			},
			SecurityContext: &corev1.SecurityContext{
				RunAsUser: ptr.To[int64](0),
			},
			Env: envVars,
			Resources: corev1.ResourceRequirements{
				Limits: corev1.ResourceList{
					corev1.ResourceCPU:    resource.MustParse("1"),
					corev1.ResourceMemory: resource.MustParse("1Gi"),
				},
			},
		},
	}
}

// init runs at package initialization and registers the NemoEvaluator and
// NemoEvaluatorList types with the package's SchemeBuilder.
func init() {
SchemeBuilder.Register(&NemoEvaluator{}, &NemoEvaluatorList{})
}
3 changes: 0 additions & 3 deletions bundle/manifests/apps.nvidia.com_nemocustomizers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -889,7 +889,6 @@ spec:
type: object
exporterOtlpEndpoint:
description: ExporterOtlpEndpoint is the OTLP collector endpoint.
minLength: 1
type: string
logLevel:
default: INFO
Expand All @@ -898,8 +897,6 @@ spec:
- INFO
- DEBUG
type: string
required:
- exporterOtlpEndpoint
type: object
podAffinity:
description: Pod affinity is a group of inter pod affinity scheduling
Expand Down
3 changes: 0 additions & 3 deletions bundle/manifests/apps.nvidia.com_nemoevaluators.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -936,7 +936,6 @@ spec:
type: object
exporterOtlpEndpoint:
description: ExporterOtlpEndpoint is the OTLP collector endpoint.
minLength: 1
type: string
logLevel:
default: INFO
Expand All @@ -945,8 +944,6 @@ spec:
- INFO
- DEBUG
type: string
required:
- exporterOtlpEndpoint
type: object
podAffinity:
description: Pod affinity is a group of inter pod affinity scheduling
Expand Down
3 changes: 0 additions & 3 deletions config/crd/bases/apps.nvidia.com_nemocustomizers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -889,7 +889,6 @@ spec:
type: object
exporterOtlpEndpoint:
description: ExporterOtlpEndpoint is the OTLP collector endpoint.
minLength: 1
type: string
logLevel:
default: INFO
Expand All @@ -898,8 +897,6 @@ spec:
- INFO
- DEBUG
type: string
required:
- exporterOtlpEndpoint
type: object
podAffinity:
description: Pod affinity is a group of inter pod affinity scheduling
Expand Down
3 changes: 0 additions & 3 deletions config/crd/bases/apps.nvidia.com_nemoevaluators.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -936,7 +936,6 @@ spec:
type: object
exporterOtlpEndpoint:
description: ExporterOtlpEndpoint is the OTLP collector endpoint.
minLength: 1
type: string
logLevel:
default: INFO
Expand All @@ -945,8 +944,6 @@ spec:
- INFO
- DEBUG
type: string
required:
- exporterOtlpEndpoint
type: object
podAffinity:
description: Pod affinity is a group of inter pod affinity scheduling
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -889,7 +889,6 @@ spec:
type: object
exporterOtlpEndpoint:
description: ExporterOtlpEndpoint is the OTLP collector endpoint.
minLength: 1
type: string
logLevel:
default: INFO
Expand All @@ -898,8 +897,6 @@ spec:
- INFO
- DEBUG
type: string
required:
- exporterOtlpEndpoint
type: object
podAffinity:
description: Pod affinity is a group of inter pod affinity scheduling
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -936,7 +936,6 @@ spec:
type: object
exporterOtlpEndpoint:
description: ExporterOtlpEndpoint is the OTLP collector endpoint.
minLength: 1
type: string
logLevel:
default: INFO
Expand All @@ -945,8 +944,6 @@ spec:
- INFO
- DEBUG
type: string
required:
- exporterOtlpEndpoint
type: object
podAffinity:
description: Pod affinity is a group of inter pod affinity scheduling
Expand Down
69 changes: 39 additions & 30 deletions internal/controller/nemo_evaluator_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -262,56 +262,56 @@ func (r *NemoEvaluatorReconciler) refreshMetrics(ctx context.Context) {
}
}

func (r *NemoEvaluatorReconciler) reconcileNemoEvaluator(ctx context.Context, NemoEvaluator *appsv1alpha1.NemoEvaluator) (ctrl.Result, error) {
func (r *NemoEvaluatorReconciler) reconcileNemoEvaluator(ctx context.Context, nemoEvaluator *appsv1alpha1.NemoEvaluator) (ctrl.Result, error) {
logger := log.FromContext(ctx)
var err error
defer func() {
if err != nil {
r.GetEventRecorder().Eventf(NemoEvaluator, corev1.EventTypeWarning, conditions.Failed,
"NemoEvaluator %s failed, msg: %s", NemoEvaluator.Name, err.Error())
r.GetEventRecorder().Eventf(nemoEvaluator, corev1.EventTypeWarning, conditions.Failed,
"NemoEvaluator %s failed, msg: %s", nemoEvaluator.Name, err.Error())
}
}()
// Generate annotation for the current operator-version and apply to all resources
// Get generic name for all resources
namespacedName := types.NamespacedName{Name: NemoEvaluator.GetName(), Namespace: NemoEvaluator.GetNamespace()}
namespacedName := types.NamespacedName{Name: nemoEvaluator.GetName(), Namespace: nemoEvaluator.GetNamespace()}
renderer := r.GetRenderer()

// Sync serviceaccount
err = r.renderAndSyncResource(ctx, NemoEvaluator, &renderer, &corev1.ServiceAccount{}, func() (client.Object, error) {
return renderer.ServiceAccount(NemoEvaluator.GetServiceAccountParams())
err = r.renderAndSyncResource(ctx, nemoEvaluator, &renderer, &corev1.ServiceAccount{}, func() (client.Object, error) {
return renderer.ServiceAccount(nemoEvaluator.GetServiceAccountParams())
}, "serviceaccount", conditions.ReasonServiceAccountFailed)
if err != nil {
return ctrl.Result{}, err
}

// Sync role
err = r.renderAndSyncResource(ctx, NemoEvaluator, &renderer, &rbacv1.Role{}, func() (client.Object, error) {
return renderer.Role(NemoEvaluator.GetRoleParams())
err = r.renderAndSyncResource(ctx, nemoEvaluator, &renderer, &rbacv1.Role{}, func() (client.Object, error) {
return renderer.Role(nemoEvaluator.GetRoleParams())
}, "role", conditions.ReasonRoleFailed)
if err != nil {
return ctrl.Result{}, err
}

// Sync rolebinding
err = r.renderAndSyncResource(ctx, NemoEvaluator, &renderer, &rbacv1.RoleBinding{}, func() (client.Object, error) {
return renderer.RoleBinding(NemoEvaluator.GetRoleBindingParams())
err = r.renderAndSyncResource(ctx, nemoEvaluator, &renderer, &rbacv1.RoleBinding{}, func() (client.Object, error) {
return renderer.RoleBinding(nemoEvaluator.GetRoleBindingParams())
}, "rolebinding", conditions.ReasonRoleBindingFailed)
if err != nil {
return ctrl.Result{}, err
}

// Sync service
err = r.renderAndSyncResource(ctx, NemoEvaluator, &renderer, &corev1.Service{}, func() (client.Object, error) {
return renderer.Service(NemoEvaluator.GetServiceParams())
err = r.renderAndSyncResource(ctx, nemoEvaluator, &renderer, &corev1.Service{}, func() (client.Object, error) {
return renderer.Service(nemoEvaluator.GetServiceParams())
}, "service", conditions.ReasonServiceFailed)
if err != nil {
return ctrl.Result{}, err
}

// Sync ingress
if NemoEvaluator.IsIngressEnabled() {
err = r.renderAndSyncResource(ctx, NemoEvaluator, &renderer, &networkingv1.Ingress{}, func() (client.Object, error) {
return renderer.Ingress(NemoEvaluator.GetIngressParams())
if nemoEvaluator.IsIngressEnabled() {
err = r.renderAndSyncResource(ctx, nemoEvaluator, &renderer, &networkingv1.Ingress{}, func() (client.Object, error) {
return renderer.Ingress(nemoEvaluator.GetIngressParams())
}, "ingress", conditions.ReasonIngressFailed)
if err != nil {
return ctrl.Result{}, err
Expand All @@ -324,9 +324,9 @@ func (r *NemoEvaluatorReconciler) reconcileNemoEvaluator(ctx context.Context, Ne
}

// Sync HPA
if NemoEvaluator.IsAutoScalingEnabled() {
err = r.renderAndSyncResource(ctx, NemoEvaluator, &renderer, &autoscalingv2.HorizontalPodAutoscaler{}, func() (client.Object, error) {
return renderer.HPA(NemoEvaluator.GetHPAParams())
if nemoEvaluator.IsAutoScalingEnabled() {
err = r.renderAndSyncResource(ctx, nemoEvaluator, &renderer, &autoscalingv2.HorizontalPodAutoscaler{}, func() (client.Object, error) {
return renderer.HPA(nemoEvaluator.GetHPAParams())
}, "hpa", conditions.ReasonHPAFailed)
if err != nil {
return ctrl.Result{}, err
Expand All @@ -340,20 +340,29 @@ func (r *NemoEvaluatorReconciler) reconcileNemoEvaluator(ctx context.Context, Ne
}

// Sync Service Monitor
if NemoEvaluator.IsServiceMonitorEnabled() {
err = r.renderAndSyncResource(ctx, NemoEvaluator, &renderer, &monitoringv1.ServiceMonitor{}, func() (client.Object, error) {
return renderer.ServiceMonitor(NemoEvaluator.GetServiceMonitorParams())
if nemoEvaluator.IsServiceMonitorEnabled() {
err = r.renderAndSyncResource(ctx, nemoEvaluator, &renderer, &monitoringv1.ServiceMonitor{}, func() (client.Object, error) {
return renderer.ServiceMonitor(nemoEvaluator.GetServiceMonitorParams())
}, "servicemonitor", conditions.ReasonServiceMonitorFailed)
if err != nil {
return ctrl.Result{}, err
}
}

deploymentParams := NemoEvaluator.GetDeploymentParams()
deploymentParams := nemoEvaluator.GetDeploymentParams()

// Sync deployment
err = r.renderAndSyncResource(ctx, NemoEvaluator, &renderer, &appsv1.Deployment{}, func() (client.Object, error) {
return renderer.Deployment(deploymentParams)
err = r.renderAndSyncResource(ctx, nemoEvaluator, &renderer, &appsv1.Deployment{}, func() (client.Object, error) {

result, err := renderer.Deployment(deploymentParams)
if err != nil {
return nil, err
}
initContainers := nemoEvaluator.GetInitContainers()
if len(initContainers) > 0 {
result.Spec.Template.Spec.InitContainers = initContainers
}
return result, err
}, "deployment", conditions.ReasonDeploymentFailed)
if err != nil {
return ctrl.Result{}, err
Expand All @@ -367,14 +376,14 @@ func (r *NemoEvaluatorReconciler) reconcileNemoEvaluator(ctx context.Context, Ne

if !ready {
// Update status as NotReady
err = r.updater.SetConditionsNotReady(ctx, NemoEvaluator, conditions.NotReady, msg)
r.GetEventRecorder().Eventf(NemoEvaluator, corev1.EventTypeNormal, conditions.NotReady,
"NemoEvaluator %s not ready yet, msg: %s", NemoEvaluator.Name, msg)
err = r.updater.SetConditionsNotReady(ctx, nemoEvaluator, conditions.NotReady, msg)
r.GetEventRecorder().Eventf(nemoEvaluator, corev1.EventTypeNormal, conditions.NotReady,
"NemoEvaluator %s not ready yet, msg: %s", nemoEvaluator.Name, msg)
} else {
// Update status as ready
err = r.updater.SetConditionsReady(ctx, NemoEvaluator, conditions.Ready, msg)
r.GetEventRecorder().Eventf(NemoEvaluator, corev1.EventTypeNormal, conditions.Ready,
"NemoEvaluator %s ready, msg: %s", NemoEvaluator.Name, msg)
err = r.updater.SetConditionsReady(ctx, nemoEvaluator, conditions.Ready, msg)
r.GetEventRecorder().Eventf(nemoEvaluator, corev1.EventTypeNormal, conditions.Ready,
"NemoEvaluator %s ready, msg: %s", nemoEvaluator.Name, msg)
}

if err != nil {
Expand Down

0 comments on commit c12ae77

Please sign in to comment.