Skip to content

Commit

Permalink
[Chore][Log] Delete error loggings right before returned errors
Browse files Browse the repository at this point in the history
Signed-off-by: Chi-Sheng Liu <[email protected]>
  • Loading branch information
MortalHappiness committed May 1, 2024
1 parent 4836d01 commit 6cdc87d
Show file tree
Hide file tree
Showing 4 changed files with 1 addition and 46 deletions.
15 changes: 0 additions & 15 deletions ray-operator/controllers/ray/raycluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,6 @@ func (r *RayClusterReconciler) rayClusterReconcile(ctx context.Context, request
"finalizer", utils.GCSFaultToleranceRedisCleanupFinalizer)
controllerutil.AddFinalizer(instance, utils.GCSFaultToleranceRedisCleanupFinalizer)
if err := r.Update(ctx, instance); err != nil {
logger.Error(err, fmt.Sprintf("Failed to add the finalizer %s to the RayCluster.", utils.GCSFaultToleranceRedisCleanupFinalizer))
return ctrl.Result{RequeueAfter: DefaultRequeueDuration}, err
}
// Only start the RayCluster reconciliation after the finalizer is added.
Expand Down Expand Up @@ -287,7 +286,6 @@ func (r *RayClusterReconciler) rayClusterReconcile(ctx context.Context, request
logger.Info(fmt.Sprintf("Redis cleanup Job already exists. Requeue the RayCluster CR %s.", instance.Name))
return ctrl.Result{RequeueAfter: DefaultRequeueDuration}, nil
}
logger.Error(err, "Failed to create Redis cleanup Job")
return ctrl.Result{RequeueAfter: DefaultRequeueDuration}, err
}
logger.Info("Successfully created Redis cleanup Job", "Job name", redisCleanupJob.Name)
Expand Down Expand Up @@ -439,7 +437,6 @@ func (r *RayClusterReconciler) reconcileRouteOpenShift(ctx context.Context, inst
headRoutes := routev1.RouteList{}
filterLabels := client.MatchingLabels{utils.RayClusterLabelKey: instance.Name}
if err := r.List(ctx, &headRoutes, client.InNamespace(instance.Namespace), filterLabels); err != nil {
logger.Error(err, "Route Listing error!", "Route.Error", err)
return err
}

Expand All @@ -451,7 +448,6 @@ func (r *RayClusterReconciler) reconcileRouteOpenShift(ctx context.Context, inst
if headRoutes.Items == nil || len(headRoutes.Items) == 0 {
route, err := common.BuildRouteForHeadService(*instance)
if err != nil {
logger.Error(err, "Failed building route!", "Route.Error", err)
return err
}

Expand All @@ -461,7 +457,6 @@ func (r *RayClusterReconciler) reconcileRouteOpenShift(ctx context.Context, inst

err = r.createHeadRoute(ctx, route, instance)
if err != nil {
logger.Error(err, "Failed creating route!", "Route.Error", err)
return err
}
}
Expand Down Expand Up @@ -925,7 +920,6 @@ func (r *RayClusterReconciler) createHeadIngress(ctx context.Context, ingress *n
logger.Info("Ingress already exists, no need to create")
return nil
}
logger.Error(err, "Ingress create error!", "Ingress.Error", err)
return err
}
logger.Info("Ingress created successfully", "ingress name", ingress.Name)
Expand All @@ -944,7 +938,6 @@ func (r *RayClusterReconciler) createHeadRoute(ctx context.Context, route *route
logger.Info("Route already exists, no need to create")
return nil
}
logger.Error(err, "Route create error!", "Route.Error", err)
return err
}
logger.Info("Route created successfully", "route name", route.Name)
Expand All @@ -967,7 +960,6 @@ func (r *RayClusterReconciler) createService(ctx context.Context, raySvc *corev1
logger.Info("Pod service already exist, no need to create")
return nil
}
logger.Error(err, "Pod Service create error!", "Pod.Service.Error", err)
return err
}
logger.Info("Pod Service created successfully", "service name", raySvc.Name)
Expand Down Expand Up @@ -999,7 +991,6 @@ func (r *RayClusterReconciler) createHeadPod(ctx context.Context, instance rayv1
// the pod might be in terminating state, we need to check
if errPod := r.Get(ctx, podIdentifier, &fetchedPod); errPod == nil {
if fetchedPod.DeletionTimestamp != nil {
logger.Error(errPod, "create pod error!", "pod is in a terminating state, we will wait until it is cleaned up", podIdentifier)
return err
}
}
Expand Down Expand Up @@ -1036,13 +1027,11 @@ func (r *RayClusterReconciler) createWorkerPod(ctx context.Context, instance ray
// the pod might be in terminating state, we need to check
if errPod := r.Get(ctx, podIdentifier, &fetchedPod); errPod == nil {
if fetchedPod.DeletionTimestamp != nil {
logger.Error(errPod, "create pod error!", "pod is in a terminating state, we will wait until it is cleaned up", podIdentifier)
return err
}
}
logger.Info("Creating pod", "Pod already exists", pod.Name)
} else {
logger.Error(fmt.Errorf("createWorkerPod error"), "error creating pod", "pod", pod, "err = ", err)
return err
}
}
Expand Down Expand Up @@ -1262,7 +1251,6 @@ func (r *RayClusterReconciler) getHeadPodIP(ctx context.Context, instance *rayv1
runtimePods := corev1.PodList{}
filterLabels := client.MatchingLabels{utils.RayClusterLabelKey: instance.Name, utils.RayNodeTypeLabelKey: string(rayv1.HeadNode)}
if err := r.List(ctx, &runtimePods, client.InNamespace(instance.Namespace), filterLabels); err != nil {
logger.Error(err, "Failed to list pods while getting head pod ip.")
return "", err
}
if len(runtimePods.Items) != 1 {
Expand Down Expand Up @@ -1392,7 +1380,6 @@ func (r *RayClusterReconciler) reconcileAutoscalerServiceAccount(ctx context.Con
logger.Info("Pod service account already exist, no need to create")
return nil
}
logger.Error(err, "Pod Service Account create error!", "Pod.ServiceAccount.Error", err)
return err
}
logger.Info("Pod ServiceAccount created successfully", "service account name", serviceAccount.Name)
Expand Down Expand Up @@ -1434,7 +1421,6 @@ func (r *RayClusterReconciler) reconcileAutoscalerRole(ctx context.Context, inst
logger.Info("role already exist, no need to create")
return nil
}
logger.Error(err, "Role create error!", "Role.Error", err)
return err
}
logger.Info("Role created successfully", "role name", role.Name)
Expand Down Expand Up @@ -1476,7 +1462,6 @@ func (r *RayClusterReconciler) reconcileAutoscalerRoleBinding(ctx context.Contex
logger.Info("role binding already exist, no need to create")
return nil
}
logger.Error(err, "Role binding create error!", "RoleBinding.Error", err)
return err
}
logger.Info("RoleBinding created successfully", "role binding name", roleBinding.Name)
Expand Down
10 changes: 0 additions & 10 deletions ray-operator/controllers/ray/rayjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -357,14 +357,10 @@ func (r *RayJobReconciler) createK8sJobIfNeed(ctx context.Context, rayJobInstanc
if errors.IsNotFound(err) {
submitterTemplate, err := r.getSubmitterTemplate(ctx, rayJobInstance, rayClusterInstance)
if err != nil {
logger.Error(err, "failed to get submitter template")
return err
}
return r.createNewK8sJob(ctx, rayJobInstance, submitterTemplate)
}

// Some other error occurred while trying to get the Job
logger.Error(err, "failed to get Kubernetes Job")
return err
}

Expand Down Expand Up @@ -444,13 +440,11 @@ func (r *RayJobReconciler) createNewK8sJob(ctx context.Context, rayJobInstance *

// Set the ownership in order to do the garbage collection by k8s.
if err := ctrl.SetControllerReference(rayJobInstance, job, r.Scheme); err != nil {
logger.Error(err, "failed to set controller reference")
return err
}

// Create the Kubernetes Job
if err := r.Client.Create(ctx, job); err != nil {
logger.Error(err, "failed to create k8s Job")
return err
}
logger.Info("Kubernetes Job created", "RayJob", rayJobInstance.Name, "Kubernetes Job", job.Name)
Expand All @@ -476,7 +470,6 @@ func (r *RayJobReconciler) deleteSubmitterJob(ctx context.Context, rayJobInstanc
isJobDeleted = true
logger.Info("The submitter Kubernetes Job has been already deleted", "RayJob", rayJobInstance.Name, "Kubernetes Job", job.Name)
} else {
logger.Error(err, "Failed to get Kubernetes Job")
return false, err
}
} else {
Expand Down Expand Up @@ -629,16 +622,13 @@ func (r *RayJobReconciler) getOrCreateRayClusterInstance(ctx context.Context, ra
logger.Info("RayCluster not found, creating RayCluster!", "RayCluster", rayClusterNamespacedName)
rayClusterInstance, err = r.constructRayClusterForRayJob(rayJobInstance, rayClusterNamespacedName.Name)
if err != nil {
logger.Error(err, "unable to construct a new RayCluster")
return nil, err
}
if err := r.Create(ctx, rayClusterInstance); err != nil {
logger.Error(err, "unable to create RayCluster for RayJob", "RayCluster", rayClusterInstance)
return nil, err
}
r.Recorder.Eventf(rayJobInstance, corev1.EventTypeNormal, "Created", "Created RayCluster %s", rayJobInstance.Status.RayClusterName)
} else {
logger.Error(err, "Fail to get RayCluster!")
return nil, err
}
}
Expand Down
17 changes: 0 additions & 17 deletions ray-operator/controllers/ray/rayservice_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -230,10 +230,8 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque
}

func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceInstance *rayv1.RayService) error {
logger := ctrl.LoggerFrom(ctx)
serveEndPoints := &corev1.Endpoints{}
if err := r.Get(ctx, common.RayServiceServeServiceNamespacedName(rayServiceInstance), serveEndPoints); err != nil && !errors.IsNotFound(err) {
logger.Error(err, "Fail to retrieve the Kubernetes Endpoints from the cluster!")
return err
}

Expand Down Expand Up @@ -438,7 +436,6 @@ func (r *RayServiceReconciler) cleanUpRayClusterInstance(ctx context.Context, ra

var err error
if err = r.List(ctx, &rayClusterList, common.RayServiceRayClustersAssociationOptions(rayServiceInstance).ToListOptions()...); err != nil {
logger.Error(err, "Fail to list RayCluster for "+rayServiceInstance.Name)
return err
}

Expand All @@ -463,7 +460,6 @@ func (r *RayServiceReconciler) cleanUpRayClusterInstance(ctx context.Context, ra
if reasonForDeletion != "" {
logger.Info("reconcileRayCluster", "delete Ray cluster", rayClusterInstance.Name, "reason", reasonForDeletion)
if err := r.Delete(ctx, &rayClusterInstance, client.PropagationPolicy(metav1.DeletePropagationBackground)); err != nil {
logger.Error(err, "Fail to delete RayCluster "+rayClusterInstance.Name)
return err
}
}
Expand All @@ -475,12 +471,10 @@ func (r *RayServiceReconciler) cleanUpRayClusterInstance(ctx context.Context, ra
}

func (r *RayServiceReconciler) getRayClusterByNamespacedName(ctx context.Context, clusterKey client.ObjectKey) (*rayv1.RayCluster, error) {
logger := ctrl.LoggerFrom(ctx)
rayCluster := &rayv1.RayCluster{}
if clusterKey.Name != "" {
// Ignore not found since in that case we should return RayCluster as nil.
if err := r.Get(ctx, clusterKey, rayCluster); client.IgnoreNotFound(err) != nil {
logger.Error(err, "Fail to get RayCluster "+clusterKey.String())
return nil, err
}
} else {
Expand Down Expand Up @@ -603,7 +597,6 @@ func (r *RayServiceReconciler) createRayClusterInstanceIfNeeded(ctx context.Cont
} else {
clusterAction, err = getClusterAction(pendingRayCluster.Spec, rayServiceInstance.Spec.RayClusterSpec)
if err != nil {
logger.Error(err, "Fail to generate hash for RayClusterSpec")
return nil, err
}
}
Expand Down Expand Up @@ -640,7 +633,6 @@ func (r *RayServiceReconciler) updateRayClusterInstance(ctx context.Context, ray
Name: rayClusterInstance.Name,
})
if err != nil {
logger.Error(err, "Failed to get the current state of RayCluster", "Namespace", rayClusterInstance.Namespace, "Name", rayClusterInstance.Name)
return err
}

Expand All @@ -658,7 +650,6 @@ func (r *RayServiceReconciler) updateRayClusterInstance(ctx context.Context, ray

// Update the RayCluster
if err = r.Update(ctx, currentRayCluster); err != nil {
logger.Error(err, "Fail to update RayCluster "+currentRayCluster.Name)
return err
}

Expand Down Expand Up @@ -692,19 +683,16 @@ func (r *RayServiceReconciler) createRayClusterInstance(ctx context.Context, ray
}
// if error is `not found`, then continue.
} else if !errors.IsNotFound(err) {
logger.Error(err, "Get request rayCluster instance error!")
return nil, err
// if error is `not found`, then continue.
}

logger.Info("No pending RayCluster, creating RayCluster.")
rayClusterInstance, err = r.constructRayClusterForRayService(ctx, rayServiceInstance, rayClusterKey.Name)
if err != nil {
logger.Error(err, "unable to construct rayCluster from spec")
return nil, err
}
if err = r.Create(ctx, rayClusterInstance); err != nil {
logger.Error(err, "unable to create rayCluster for rayService", "rayCluster", rayClusterInstance)
return nil, err
}
logger.Info("created rayCluster for rayService", "rayCluster", rayClusterInstance)
Expand Down Expand Up @@ -991,7 +979,6 @@ func (r *RayServiceReconciler) reconcileServices(ctx context.Context, rayService
oldSvc.Spec = *newSvc.Spec.DeepCopy()
logger.Info(fmt.Sprintf("Update Kubernetes Service serviceType %v", serviceType))
if updateErr := r.Update(ctx, oldSvc); updateErr != nil {
logger.Error(updateErr, fmt.Sprintf("Fail to update Kubernetes Service serviceType %v", serviceType), "Error", updateErr)
return updateErr
}
} else if errors.IsNotFound(err) {
Expand All @@ -1004,11 +991,9 @@ func (r *RayServiceReconciler) reconcileServices(ctx context.Context, rayService
logger.Info("The Kubernetes Service already exists, no need to create.")
return nil
}
logger.Error(createErr, fmt.Sprintf("Fail to create Kubernetes Service serviceType %v", serviceType), "Error", createErr)
return createErr
}
} else {
logger.Error(err, "Fail to retrieve the Kubernetes Service from the cluster!")
return err
}

Expand Down Expand Up @@ -1120,7 +1105,6 @@ func (r *RayServiceReconciler) reconcileServe(ctx context.Context, rayServiceIns
}

func (r *RayServiceReconciler) labelHeadPodForServeStatus(ctx context.Context, rayClusterInstance *rayv1.RayCluster) error {
logger := ctrl.LoggerFrom(ctx)
headPod, err := r.getHeadPod(ctx, rayClusterInstance)
if err != nil {
return err
Expand Down Expand Up @@ -1151,7 +1135,6 @@ func (r *RayServiceReconciler) labelHeadPodForServeStatus(ctx context.Context, r

if !reflect.DeepEqual(originalLabels, headPod.Labels) {
if updateErr := r.Update(ctx, headPod); updateErr != nil {
logger.Error(updateErr, "Pod label Update error!", "Pod.Error", updateErr)
return updateErr
}
}
Expand Down
5 changes: 1 addition & 4 deletions ray-operator/controllers/ray/utils/httpproxy_httpclient.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,18 +50,15 @@ func (r *RayHttpProxyClient) SetHostIp(hostIp, podNamespace, podName string, por

// CheckProxyActorHealth checks the health status of the Ray Serve proxy actor.
func (r *RayHttpProxyClient) CheckProxyActorHealth(ctx context.Context) error {
logger := ctrl.LoggerFrom(ctx)
resp, err := r.client.Get(r.httpProxyURL + RayServeProxyHealthPath)
if err != nil {
logger.Error(err, "CheckProxyActorHealth fails.")
return err
}
defer resp.Body.Close()

body, _ := io.ReadAll(resp.Body)
if resp.StatusCode != 200 {
err := fmt.Errorf("CheckProxyActorHealth fails: Status code is not 200")
logger.Error(err, "CheckProxyActorHealth fails.", "status code", resp.StatusCode, "status", resp.Status, "body", string(body))
err := fmt.Errorf("CheckProxyActorHealth fails. status code: %d, status: %s, body: %s", resp.StatusCode, resp.Status, string(body))
return err
}

Expand Down

0 comments on commit 6cdc87d

Please sign in to comment.