@@ -495,73 +495,112 @@ func getSelectedProfiles(nimCache *appsv1alpha1.NIMCache) ([]string, error) {
495
495
return nil , nil
496
496
}
497
497
498
- func (r * NIMCacheReconciler ) reconcileModelSelection (ctx context.Context , nimCache * appsv1alpha1.NIMCache ) (requeue bool , err error ) {
498
+ func (r * NIMCacheReconciler ) reconcileModelManifest (ctx context.Context , nimCache * appsv1alpha1.NIMCache ) (requeue bool , err error ) {
499
499
logger := r .GetLogger ()
500
500
501
- // reconcile model selection pod
502
- if isModelSelectionRequired (nimCache ) && ! isModelSelectionDone (nimCache ) {
503
- // Create a temporary pod for parsing model manifest
504
- pod := constructPodSpec (nimCache )
505
- // Add nimCache as owner for watching on status change
506
- if err := controllerutil .SetControllerReference (nimCache , pod , r .GetScheme ()); err != nil {
507
- return false , err
508
- }
509
- err := r .createPod (ctx , pod )
510
- if err != nil {
511
- logger .Error (err , "failed to create" , "pod" , pod .Name )
512
- return false , err
513
- }
501
+ // Model manifest is available only for NGC model pullers
502
+ if nimCache .Spec .Source .NGC == nil {
503
+ return false , nil
504
+ }
514
505
515
- existingPod := & corev1.Pod {}
516
- err = r .Get (ctx , client.ObjectKey {Name : pod .Name , Namespace : nimCache .Namespace }, existingPod )
517
- if err != nil {
518
- logger .Error (err , "failed to get pod for model selection" , "pod" , pod .Name )
519
- return false , err
520
- }
506
+ existingConfig := & corev1.ConfigMap {}
507
+ cmName := getManifestConfigName (nimCache )
508
+ err = r .Get (ctx , client.ObjectKey {Name : cmName , Namespace : nimCache .Namespace }, existingConfig )
509
+ if err != nil && client .IgnoreNotFound (err ) != nil {
510
+ logger .Error (err , "failed to get configmap of the model manifest" , "name" , cmName )
511
+ return false , err
512
+ }
521
513
522
- if existingPod . Status . Phase != corev1 . PodRunning {
523
- // requeue request with delay until the pod is ready
524
- return true , nil
525
- }
514
+ // No action if the configmap is already created
515
+ if err == nil {
516
+ return false , nil
517
+ }
526
518
527
- // Extract manifest file
528
- output , err := r .getPodLogs (ctx , existingPod )
529
- if err != nil {
530
- logger .Error (err , "failed to get pod logs for parsing model manifest file" , "pod" , pod .Name )
531
- return false , err
532
- }
519
+ // Create a configmap by extracting the model manifest
520
+ // Create a temporary pod for parsing model manifest
521
+ pod := constructPodSpec (nimCache )
522
+ // Add nimCache as owner for watching on status change
523
+ if err := controllerutil .SetControllerReference (nimCache , pod , r .GetScheme ()); err != nil {
524
+ return false , err
525
+ }
526
+ err = r .createPod (ctx , pod )
527
+ if err != nil {
528
+ logger .Error (err , "failed to create" , "pod" , pod .Name )
529
+ return false , err
530
+ }
533
531
534
- // Parse the file
535
- manifest , err := nimparser .ParseModelManifestFromRawOutput ([]byte (output ))
536
- if err != nil {
537
- logger .Error (err , "Failed to parse model manifest from the pod" )
538
- return false , err
539
- }
540
- logger .V (2 ).Info ("manifest file" , "nimcache" , nimCache .Name , "manifest" , manifest )
532
+ existingPod := & corev1.Pod {}
533
+ err = r .Get (ctx , client.ObjectKey {Name : pod .Name , Namespace : nimCache .Namespace }, existingPod )
534
+ if err != nil {
535
+ logger .Error (err , "failed to get pod for model selection" , "pod" , pod .Name )
536
+ return false , err
537
+ }
541
538
542
- // Create a ConfigMap with the model manifest file for re-use
543
- err = r .createManifestConfigMap (ctx , nimCache , manifest )
544
- if err != nil {
545
- logger .Error (err , "Failed to create model manifest config map" )
546
- return false , err
547
- }
539
+ if existingPod .Status .Phase != corev1 .PodRunning {
540
+ // requeue request with delay until the pod is ready
541
+ return true , nil
542
+ }
548
543
544
+ // Extract manifest file
545
+ output , err := r .getPodLogs (ctx , existingPod )
546
+ if err != nil {
547
+ logger .Error (err , "failed to get pod logs for parsing model manifest file" , "pod" , pod .Name )
548
+ return false , err
549
+ }
550
+
551
+ // Parse the file
552
+ manifest , err := nimparser .ParseModelManifestFromRawOutput ([]byte (output ))
553
+ if err != nil {
554
+ logger .Error (err , "Failed to parse model manifest from the pod" )
555
+ return false , err
556
+ }
557
+ logger .V (2 ).Info ("manifest file" , "nimcache" , nimCache .Name , "manifest" , manifest )
558
+
559
+ // Create a ConfigMap with the model manifest file for re-use
560
+ err = r .createManifestConfigMap (ctx , nimCache , manifest )
561
+ if err != nil {
562
+ logger .Error (err , "Failed to create model manifest config map" )
563
+ return false , err
564
+ }
565
+
566
+ // Model manifest is successfully extracted, cleanup temporary pod
567
+ err = r .Delete (ctx , existingPod )
568
+ if err != nil && ! errors .IsNotFound (err ) {
569
+ logger .Error (err , "failed to delete" , "pod" , pod .Name )
570
+ // requeue request with delay until the pod is cleaned up
571
+ // this is required as NIM containers are resource heavy
572
+ return true , err
573
+ }
574
+ return false , nil
575
+ }
576
+
577
+ func (r * NIMCacheReconciler ) reconcileModelSelection (ctx context.Context , nimCache * appsv1alpha1.NIMCache ) error {
578
+ logger := r .GetLogger ()
579
+
580
+ // reconcile model selection pod
581
+ if isModelSelectionRequired (nimCache ) && ! isModelSelectionDone (nimCache ) {
549
582
var discoveredGPUs []string
550
583
// If no specific GPUs are provided, then auto-detect GPUs in the cluster for profile selection
551
584
if len (nimCache .Spec .Source .NGC .Model .GPUs ) == 0 {
552
585
gpusByNode , err := r .GetNodeGPUProducts (ctx )
553
586
if err != nil {
554
587
logger .Error (err , "Failed to get gpus in the cluster" )
555
- return false , err
588
+ return err
556
589
}
557
590
discoveredGPUs = getUniqueGPUProducts (gpusByNode )
558
591
}
559
592
593
+ // Get the model manifest from the config
594
+ nimManifest , err := r .extractNIMManifest (ctx , getManifestConfigName (nimCache ), nimCache .GetNamespace ())
595
+ if err != nil {
596
+ return fmt .Errorf ("failed to get model manifest config file: %w" , err )
597
+ }
598
+
560
599
// Match profiles with user input
561
- profiles , err := nimparser .MatchProfiles (nimCache .Spec .Source .NGC .Model , * manifest , discoveredGPUs )
600
+ profiles , err := nimparser .MatchProfiles (nimCache .Spec .Source .NGC .Model , * nimManifest , discoveredGPUs )
562
601
if err != nil {
563
602
logger .Error (err , "Failed to match profiles for given model parameters" )
564
- return false , err
603
+ return err
565
604
}
566
605
567
606
// Add the annotation to the NIMCache object
@@ -572,25 +611,16 @@ func (r *NIMCacheReconciler) reconcileModelSelection(ctx context.Context, nimCac
572
611
profilesJSON , err := json .Marshal (profiles )
573
612
if err != nil {
574
613
logger .Error (err , "unable to marshal profiles to JSON" )
575
- return false , err
614
+ return err
576
615
}
577
616
578
617
nimCache .Annotations [SelectedNIMProfilesAnnotationKey ] = string (profilesJSON )
579
618
if err := r .Update (ctx , nimCache ); err != nil {
580
619
logger .Error (err , "unable to update NIMCache with selected profiles annotation" )
581
- return false , err
582
- }
583
-
584
- // Selected profiles updated, cleanup temporary pod
585
- err = r .Delete (ctx , existingPod )
586
- if err != nil && ! errors .IsNotFound (err ) {
587
- logger .Error (err , "failed to delete" , "pod" , pod .Name )
588
- // requeue request with delay until the pod is cleaned up
589
- // this is required as NIM containers are resource heavy
590
- return true , err
620
+ return err
591
621
}
592
622
}
593
- return false , nil
623
+ return nil
594
624
}
595
625
596
626
func (r * NIMCacheReconciler ) reconcileJob (ctx context.Context , nimCache * appsv1alpha1.NIMCache ) error {
@@ -755,10 +785,9 @@ func (r *NIMCacheReconciler) reconcileNIMCache(ctx context.Context, nimCache *ap
755
785
return ctrl.Result {}, err
756
786
}
757
787
758
- // Reconcile NIM model selection
759
- requeue , err := r .reconcileModelSelection (ctx , nimCache )
788
+ requeue , err := r .reconcileModelManifest (ctx , nimCache )
760
789
if err != nil {
761
- logger .Error (err , "reconciliation of model selection failed" , "pod" , getPodName (nimCache ))
790
+ logger .Error (err , "reconciliation to extract model manifest failed" , "pod" , getPodName (nimCache ))
762
791
return ctrl.Result {}, err
763
792
}
764
793
@@ -767,6 +796,13 @@ func (r *NIMCacheReconciler) reconcileNIMCache(ctx context.Context, nimCache *ap
767
796
return ctrl.Result {RequeueAfter : time .Second * 30 }, err
768
797
}
769
798
799
+ // Reconcile NIM model selection
800
+ err = r .reconcileModelSelection (ctx , nimCache )
801
+ if err != nil {
802
+ logger .Error (err , "reconciliation of model selection failed" )
803
+ return ctrl.Result {}, err
804
+ }
805
+
770
806
// Reconcile caching Job
771
807
err = r .reconcileJob (ctx , nimCache )
772
808
if err != nil {
0 commit comments