From 6142868faf3041b5268d827fc163e157f9e69986 Mon Sep 17 00:00:00 2001 From: catpineapple Date: Thu, 21 Nov 2024 15:30:26 +0800 Subject: [PATCH 1/4] add rolling restart --- api/doris/v1/doriscluster_util.go | 6 ++ config/crd/bases/crds.yaml | 8 +++ .../doris.selectdb.com_dorisclusters.yaml | 8 +++ doc/api.md | 17 ++++- doc/disaggregated_api.md | 2 +- .../doris.selectdb.com_dorisclusters.yaml | 8 +++ .../sub_controller/be/controller.go | 9 +++ .../sub_controller/be/prepare_modify.go | 39 ++++++++++++ pkg/controller/sub_controller/events.go | 2 + .../sub_controller/fe/controller.go | 6 +- .../sub_controller/fe/prepare_modify.go | 12 +++- .../sub_controller/sub_controller.go | 62 ++++++++++++++++++- 12 files changed, 173 insertions(+), 6 deletions(-) create mode 100644 pkg/controller/sub_controller/be/prepare_modify.go diff --git a/api/doris/v1/doriscluster_util.go b/api/doris/v1/doriscluster_util.go index 3ecfc74c..cfe9759b 100644 --- a/api/doris/v1/doriscluster_util.go +++ b/api/doris/v1/doriscluster_util.go @@ -43,6 +43,12 @@ const ( OwnerReference string = "app.doris.ownerreference/name" ServiceRoleForCluster string = "app.doris.service/role" + + FERestartAt string = "apache.doris.fe/restartedAt" + BERestartAt string = "apache.doris.be/restartedAt" + CNRestartAt string = "apache.doris.cn/restartedAt" + BrokerRestartAt string = "apache.doris.broker/restartedAt" + DorisRollingRestartAt string = "apache.doris.org/restartedAt" ) type ServiceRole string diff --git a/config/crd/bases/crds.yaml b/config/crd/bases/crds.yaml index 147a9ef6..4cd2c6eb 100644 --- a/config/crd/bases/crds.yaml +++ b/config/crd/bases/crds.yaml @@ -1094,6 +1094,14 @@ spec: type: string type: object type: object + enableWorkloadGroup: + description: |- + EnableWorkloadGroup is a switch that determines whether the doris cluster enables the workload group. + Default value is 'false'. + Enabling it means that the container must be started in privileged mode. + Please confirm whether the host machine and k8s cluster allow it. + Doris workloadgroup reference document: https://doris.apache.org/docs/admin-manual/resource-admin/workload-group + type: boolean envVars: description: cnEnvVars is a slice of environment variables that are added to the pods, the default is empty. diff --git a/config/crd/bases/doris.selectdb.com_dorisclusters.yaml b/config/crd/bases/doris.selectdb.com_dorisclusters.yaml index 6ba9a760..d7a17403 100644 --- a/config/crd/bases/doris.selectdb.com_dorisclusters.yaml +++ b/config/crd/bases/doris.selectdb.com_dorisclusters.yaml @@ -1094,6 +1094,14 @@ spec: type: string type: object type: object + enableWorkloadGroup: + description: |- + EnableWorkloadGroup is a switch that determines whether the doris cluster enables the workload group. + Default value is 'false'. + Enabling it means that the container must be started in privileged mode. + Please confirm whether the host machine and k8s cluster allow it. + Doris workloadgroup reference document: https://doris.apache.org/docs/admin-manual/resource-admin/workload-group + type: boolean envVars: description: cnEnvVars is a slice of environment variables that are added to the pods, the default is empty. diff --git a/doc/api.md b/doc/api.md index 917fcf63..5adda542 100644 --- a/doc/api.md +++ b/doc/api.md @@ -617,6 +617,21 @@ BaseSpec

the foundation spec for creating be software services.

+ + +enableWorkloadGroup
+ +bool + + + +

EnableWorkloadGroup is a switch that determines whether the doris cluster enables the workload group. +Default value is ‘false’. +Enabling it means that the container must be started in privileged mode. +Please confirm whether the host machine and k8s cluster allow it. +Doris workloadgroup reference document: https://doris.apache.org/docs/admin-manual/resource-admin/workload-group

+ +

BrokerSpec @@ -2604,5 +2619,5 @@ string

Generated with gen-crd-api-reference-docs -on git commit 54d1acc. +on git commit 1a70fe9.

diff --git a/doc/disaggregated_api.md b/doc/disaggregated_api.md index 7a7453c3..4b63b9b0 100644 --- a/doc/disaggregated_api.md +++ b/doc/disaggregated_api.md @@ -1441,5 +1441,5 @@ string

Generated with gen-crd-api-reference-docs -on git commit 54d1acc. +on git commit 7d27da0.

diff --git a/helm-charts/doris-operator/crds/doris.selectdb.com_dorisclusters.yaml b/helm-charts/doris-operator/crds/doris.selectdb.com_dorisclusters.yaml index 6ba9a760..d7a17403 100644 --- a/helm-charts/doris-operator/crds/doris.selectdb.com_dorisclusters.yaml +++ b/helm-charts/doris-operator/crds/doris.selectdb.com_dorisclusters.yaml @@ -1094,6 +1094,14 @@ spec: type: string type: object type: object + enableWorkloadGroup: + description: |- + EnableWorkloadGroup is a switch that determines whether the doris cluster enables the workload group. + Default value is 'false'. + Enabling it means that the container must be started in privileged mode. + Please confirm whether the host machine and k8s cluster allow it. + Doris workloadgroup reference document: https://doris.apache.org/docs/admin-manual/resource-admin/workload-group + type: boolean envVars: description: cnEnvVars is a slice of environment variables that are added to the pods, the default is empty. diff --git a/pkg/controller/sub_controller/be/controller.go b/pkg/controller/sub_controller/be/controller.go index 22c171d5..1dc4f2b3 100644 --- a/pkg/controller/sub_controller/be/controller.go +++ b/pkg/controller/sub_controller/be/controller.go @@ -54,6 +54,11 @@ func (be *Controller) Sync(ctx context.Context, dcr *v1.DorisCluster) error { if dcr.Spec.BeSpec == nil { return nil } + + var oldStatus v1.ComponentStatus + if dcr.Status.BEStatus != nil { + oldStatus = *(dcr.Status.BEStatus.DeepCopy()) + } be.InitStatus(dcr, v1.Component_BE) if !be.FeAvailable(dcr) { return nil @@ -83,6 +88,10 @@ func (be *Controller) Sync(ctx context.Context, dcr *v1.DorisCluster) error { return err } + if err = be.prepareStatefulsetApply(ctx, dcr, oldStatus); err != nil { + return err + } + st := be.buildBEStatefulSet(dcr) if !be.PrepareReconcileResources(ctx, dcr, v1.Component_BE) { klog.Infof("be controller sync preparing resource for reconciling namespace %s name %s!", dcr.Namespace, dcr.Name) diff --git a/pkg/controller/sub_controller/be/prepare_modify.go b/pkg/controller/sub_controller/be/prepare_modify.go new file mode 100644 index 00000000..dc606e66 --- /dev/null +++ b/pkg/controller/sub_controller/be/prepare_modify.go @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package be + +import ( + "context" + v1 "github.com/apache/doris-operator/api/doris/v1" +) + +// prepareStatefulsetApply means Pre-operation and status control on the client side +func (be *Controller) prepareStatefulsetApply(ctx context.Context, dcr *v1.DorisCluster, oldStatus v1.ComponentStatus) error { + + // be rolling restart + // check 1: be Phase is Available + // check 2: be RestartTime is not empty and useful + // check 3: be RestartTime different from old(This condition does not need to be checked here. If it is allowed to pass, it will be processed idempotent when applying sts.) + if oldStatus.ComponentCondition.Phase == v1.Available && be.CheckRestartTimeAndInject(dcr, v1.Component_BE) { + dcr.Status.BEStatus.ComponentCondition.Phase = v1.Restarting + } + + //TODO check upgrade + + return nil +} diff --git a/pkg/controller/sub_controller/events.go b/pkg/controller/sub_controller/events.go index d890adb3..24315bce 100644 --- a/pkg/controller/sub_controller/events.go +++ b/pkg/controller/sub_controller/events.go @@ -70,6 +70,8 @@ var ( MSServiceDeletedFailed EventReason = "MSServiceDeletedFailed" MSStatefulsetDeleteFailed EventReason = "MSStatefulsetDeleteFailed" FDBAddressNotConfiged EventReason = "FDBAddressNotConfiged" + RestartParamIllegal EventReason = "RestartParamIllegal" + RollingRestart EventReason = "BERestarting" ) type Event struct { diff --git a/pkg/controller/sub_controller/fe/controller.go b/pkg/controller/sub_controller/fe/controller.go index 347dcf03..b816f470 100644 --- a/pkg/controller/sub_controller/fe/controller.go +++ b/pkg/controller/sub_controller/fe/controller.go @@ -80,6 +80,10 @@ func (fc *Controller) Sync(ctx context.Context, cluster *v1.DorisCluster) error klog.Info("fe Controller Sync ", "the fe component is not needed ", "namespace ", cluster.Namespace, " doris cluster name ", cluster.Name) return nil } + var oldStatus v1.ComponentStatus + if cluster.Status.FEStatus != nil { + oldStatus = *(cluster.Status.FEStatus.DeepCopy()) + } fc.InitStatus(cluster, v1.Component_FE) feSpec := cluster.Spec.FeSpec @@ -111,7 +115,7 @@ func (fc *Controller) Sync(ctx context.Context, cluster *v1.DorisCluster) error return nil } - if err = fc.prepareStatefulsetApply(ctx, cluster); err != nil { + if err = fc.prepareStatefulsetApply(ctx, cluster, oldStatus); err != nil { return err } diff --git a/pkg/controller/sub_controller/fe/prepare_modify.go b/pkg/controller/sub_controller/fe/prepare_modify.go index c484c5dd..715c01c0 100644 --- a/pkg/controller/sub_controller/fe/prepare_modify.go +++ b/pkg/controller/sub_controller/fe/prepare_modify.go @@ -33,7 +33,7 @@ import ( ) // prepareStatefulsetApply means Pre-operation and status control on the client side -func (fc *Controller) prepareStatefulsetApply(ctx context.Context, cluster *v1.DorisCluster) error { +func (fc *Controller) prepareStatefulsetApply(ctx context.Context, cluster *v1.DorisCluster, oldStatus v1.ComponentStatus) error { var oldSt appv1.StatefulSet err := fc.K8sclient.Get(ctx, types.NamespacedName{Namespace: cluster.Namespace, Name: v1.GenerateComponentStatefulSetName(cluster, v1.Component_FE)}, &oldSt) if err != nil { @@ -63,7 +63,15 @@ func (fc *Controller) prepareStatefulsetApply(ctx context.Context, cluster *v1.D return nil } - //TODO check upgrade ,restart + // fe rolling restart + // check 1: fe Phase is Available + // check 2: fe RestartTime is not empty and useful + // check 3: fe RestartTime different from old(This condition does not need to be checked here. If it is allowed to pass, it will be processed idempotent when applying sts.) + if oldStatus.ComponentCondition.Phase == v1.Available && fc.CheckRestartTimeAndInject(cluster, v1.Component_FE) { + cluster.Status.FEStatus.ComponentCondition.Phase = v1.Restarting + } + + //TODO check upgrade return nil } diff --git a/pkg/controller/sub_controller/sub_controller.go b/pkg/controller/sub_controller/sub_controller.go index 91385939..84d84c82 100644 --- a/pkg/controller/sub_controller/sub_controller.go +++ b/pkg/controller/sub_controller/sub_controller.go @@ -56,6 +56,59 @@ type SubDefaultController struct { K8srecorder record.EventRecorder } +func (d *SubDefaultController) CheckRestartTimeAndInject(dcr *dorisv1.DorisCluster, componentType dorisv1.ComponentType) bool { + var baseSpec *dorisv1.BaseSpec + var restartedAt string + switch componentType { + case dorisv1.Component_FE: + baseSpec = &dcr.Spec.FeSpec.BaseSpec + restartedAt = dcr.Annotations[dorisv1.FERestartAt] + case dorisv1.Component_BE: + baseSpec = &dcr.Spec.BeSpec.BaseSpec + restartedAt = dcr.Annotations[dorisv1.BERestartAt] + default: + klog.Errorf("CheckRestartTimeAndInject dorisClusterName %s, namespace %s componentType %s not supported.", dcr.Name, dcr.Namespace, componentType) + } + + if restartedAt == "" { + return false + } + + // run shell: date +"%Y-%m-%dT%H:%M:%S%:z" + // "2024-11-21T11:08:56+08:00" + parseTime, err := time.Parse(time.RFC3339, restartedAt) + if err != nil { + checkErr := fmt.Errorf("CheckRestartTimeAndInject error: time format is incorrect. dorisClusterName: %s, namespace: %s, componentType %s, wrong parse 'restartedAt': %s , error: %s", dcr.Name, dcr.Namespace, componentType, restartedAt, err.Error()) + klog.Error(checkErr.Error()) + d.K8srecorder.Event(dcr, string(EventWarning), string(RestartParamIllegal), checkErr.Error()) + return false + } + + restartAt := time.Now().Add(-10 * time.Minute) + + if restartAt.After(parseTime) { + checkErr := fmt.Errorf("CheckRestartTimeAndInject The time has expired, dorisClusterName: %s, namespace: %s, componentType %s, wrong parse 'restartedAt': %s : The time has expired, if you want to restart doris, please set a future time", dcr.Name, dcr.Namespace, componentType, restartedAt) + klog.Error(checkErr.Error()) + d.K8srecorder.Event(dcr, string(EventWarning), string(RestartParamIllegal), checkErr.Error()) + return false + } + + klog.Infof("CheckRestartTime successed, DCR %s in namespace %s, will restart %s ", dcr.Name, dcr.Namespace, componentType) + d.K8srecorder.Event( + dcr, + string(EventNormal), + string(RollingRestart), + fmt.Sprintf("CheckRestartTime successed, DCR %s in namespace %s, restart %s ", dcr.Name, dcr.Namespace, componentType), + ) + + // check passed, set annotations to doriscluster baseSpec + if baseSpec.Annotations == nil { + baseSpec.Annotations = make(map[string]string) + } + baseSpec.Annotations[dorisv1.DorisRollingRestartAt] = restartedAt + return true +} + // UpdateStatus update the component status on src. func (d *SubDefaultController) UpdateStatus(namespace string, status *dorisv1.ComponentStatus, labels map[string]string, replicas int32) error { return d.ClassifyPodsByStatus(namespace, status, labels, replicas) @@ -68,9 +121,16 @@ func (d *SubDefaultController) ClassifyPodsByStatus(namespace string, status *do } var creatings, readys, faileds []string + var firstStartAnnotation string podmap := make(map[string]corev1.Pod) + if len(podList.Items) > 0 { + firstStartAnnotation = podList.Items[0].Annotations[dorisv1.DorisRollingRestartAt] + } + //get all pod status that controlled by st. + stsRollingRestartAnnotationsSameCheck := true for _, pod := range podList.Items { + stsRollingRestartAnnotationsSameCheck = stsRollingRestartAnnotationsSameCheck && pod.Annotations[dorisv1.DorisRollingRestartAt] == firstStartAnnotation podmap[pod.Name] = pod if ready := k8s.PodIsReady(&pod.Status); ready { readys = append(readys, pod.Name) @@ -81,7 +141,7 @@ func (d *SubDefaultController) ClassifyPodsByStatus(namespace string, status *do } } - if len(readys) == int(replicas) { + if len(readys) == int(replicas) && stsRollingRestartAnnotationsSameCheck { status.ComponentCondition.Phase = dorisv1.Available } else if len(faileds) != 0 { status.ComponentCondition.Phase = dorisv1.HaveMemberFailed From 820c6a80194fb1379522364d32716fc0a96f1714 Mon Sep 17 00:00:00 2001 From: catpineapple Date: Thu, 21 Nov 2024 15:50:50 +0800 Subject: [PATCH 2/4] add rolling restart --- api/doris/v1/doriscluster_util.go | 2 -- pkg/controller/sub_controller/sub_controller.go | 6 +++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/api/doris/v1/doriscluster_util.go b/api/doris/v1/doriscluster_util.go index cfe9759b..013b3bb8 100644 --- a/api/doris/v1/doriscluster_util.go +++ b/api/doris/v1/doriscluster_util.go @@ -46,8 +46,6 @@ const ( FERestartAt string = "apache.doris.fe/restartedAt" BERestartAt string = "apache.doris.be/restartedAt" - CNRestartAt string = "apache.doris.cn/restartedAt" - BrokerRestartAt string = "apache.doris.broker/restartedAt" DorisRollingRestartAt string = "apache.doris.org/restartedAt" ) diff --git a/pkg/controller/sub_controller/sub_controller.go b/pkg/controller/sub_controller/sub_controller.go index 84d84c82..cf5f6979 100644 --- a/pkg/controller/sub_controller/sub_controller.go +++ b/pkg/controller/sub_controller/sub_controller.go @@ -121,16 +121,16 @@ func (d *SubDefaultController) ClassifyPodsByStatus(namespace string, status *do } var creatings, readys, faileds []string - var firstStartAnnotation string + var firstRestartAnnotation string podmap := make(map[string]corev1.Pod) if len(podList.Items) > 0 { - firstStartAnnotation = podList.Items[0].Annotations[dorisv1.DorisRollingRestartAt] + firstRestartAnnotation = podList.Items[0].Annotations[dorisv1.DorisRollingRestartAt] } //get all pod status that controlled by st. stsRollingRestartAnnotationsSameCheck := true for _, pod := range podList.Items { - stsRollingRestartAnnotationsSameCheck = stsRollingRestartAnnotationsSameCheck && pod.Annotations[dorisv1.DorisRollingRestartAt] == firstStartAnnotation + stsRollingRestartAnnotationsSameCheck = stsRollingRestartAnnotationsSameCheck && pod.Annotations[dorisv1.DorisRollingRestartAt] == firstRestartAnnotation podmap[pod.Name] = pod if ready := k8s.PodIsReady(&pod.Status); ready { readys = append(readys, pod.Name) From 5e70cdb261244cd4b414ecd3f80787711ed6f197 Mon Sep 17 00:00:00 2001 From: catpineapple Date: Thu, 21 Nov 2024 17:35:01 +0800 Subject: [PATCH 3/4] add rolling restart --- api/doris/v1/doriscluster_util.go | 7 ++- .../sub_controller/be/controller.go | 2 +- .../sub_controller/broker/controller.go | 2 +- .../sub_controller/cn/controller.go | 2 +- pkg/controller/sub_controller/events.go | 4 +- .../sub_controller/fe/controller.go | 2 +- .../sub_controller/sub_controller.go | 45 +++++++++++-------- 7 files changed, 36 insertions(+), 28 deletions(-) diff --git a/api/doris/v1/doriscluster_util.go b/api/doris/v1/doriscluster_util.go index 013b3bb8..ecadd4e7 100644 --- a/api/doris/v1/doriscluster_util.go +++ b/api/doris/v1/doriscluster_util.go @@ -28,6 +28,9 @@ import ( const ( //ComponentsResourceHash the component hash ComponentResourceHash string = "app.doris.components/hash" + + FERestartAt string = "apache.doris.fe/restartedAt" + BERestartAt string = "apache.doris.be/restartedAt" ) // the labels key @@ -43,10 +46,6 @@ const ( OwnerReference string = "app.doris.ownerreference/name" ServiceRoleForCluster string = "app.doris.service/role" - - FERestartAt string = "apache.doris.fe/restartedAt" - BERestartAt string = "apache.doris.be/restartedAt" - DorisRollingRestartAt string = "apache.doris.org/restartedAt" ) type ServiceRole string diff --git a/pkg/controller/sub_controller/be/controller.go b/pkg/controller/sub_controller/be/controller.go index 1dc4f2b3..d6fcfa3f 100644 --- a/pkg/controller/sub_controller/be/controller.go +++ b/pkg/controller/sub_controller/be/controller.go @@ -118,7 +118,7 @@ func (be *Controller) UpdateComponentStatus(cluster *v1.DorisCluster) error { return nil } - return be.ClassifyPodsByStatus(cluster.Namespace, cluster.Status.BEStatus, v1.GenerateStatefulSetSelector(cluster, v1.Component_BE), *cluster.Spec.BeSpec.Replicas) + return be.ClassifyPodsByStatus(cluster.Namespace, cluster.Status.BEStatus, v1.GenerateStatefulSetSelector(cluster, v1.Component_BE), *cluster.Spec.BeSpec.Replicas, v1.Component_BE) } func (be *Controller) ClearResources(ctx context.Context, dcr *v1.DorisCluster) (bool, error) { diff --git a/pkg/controller/sub_controller/broker/controller.go b/pkg/controller/sub_controller/broker/controller.go index da90a17f..0e87a25a 100644 --- a/pkg/controller/sub_controller/broker/controller.go +++ b/pkg/controller/sub_controller/broker/controller.go @@ -113,7 +113,7 @@ func (bk *Controller) UpdateComponentStatus(cluster *v1.DorisCluster) error { cluster.Status.BrokerStatus = bs bs.AccessService = v1.GenerateExternalServiceName(cluster, v1.Component_Broker) - return bk.ClassifyPodsByStatus(cluster.Namespace, bs, v1.GenerateStatefulSetSelector(cluster, v1.Component_Broker), *cluster.Spec.BrokerSpec.Replicas) + return bk.ClassifyPodsByStatus(cluster.Namespace, bs, v1.GenerateStatefulSetSelector(cluster, v1.Component_Broker), *cluster.Spec.BrokerSpec.Replicas, v1.Component_Broker) } diff --git a/pkg/controller/sub_controller/cn/controller.go b/pkg/controller/sub_controller/cn/controller.go index 202b0c4f..27cfd936 100644 --- a/pkg/controller/sub_controller/cn/controller.go +++ b/pkg/controller/sub_controller/cn/controller.go @@ -145,7 +145,7 @@ func (cn *Controller) UpdateComponentStatus(cluster *dorisv1.DorisCluster) error replicas := *est.Spec.Replicas cs.AccessService = dorisv1.GenerateExternalServiceName(cluster, dorisv1.Component_CN) - return cn.ClassifyPodsByStatus(cluster.Namespace, &cs.ComponentStatus, dorisv1.GenerateStatefulSetSelector(cluster, dorisv1.Component_CN), replicas) + return cn.ClassifyPodsByStatus(cluster.Namespace, &cs.ComponentStatus, dorisv1.GenerateStatefulSetSelector(cluster, dorisv1.Component_CN), replicas, dorisv1.Component_CN) } // autoscaler represents start autoscaler or not. diff --git a/pkg/controller/sub_controller/events.go b/pkg/controller/sub_controller/events.go index 24315bce..b8b57c90 100644 --- a/pkg/controller/sub_controller/events.go +++ b/pkg/controller/sub_controller/events.go @@ -70,8 +70,8 @@ var ( MSServiceDeletedFailed EventReason = "MSServiceDeletedFailed" MSStatefulsetDeleteFailed EventReason = "MSStatefulsetDeleteFailed" FDBAddressNotConfiged EventReason = "FDBAddressNotConfiged" - RestartParamIllegal EventReason = "RestartParamIllegal" - RollingRestart EventReason = "BERestarting" + RestartParameterIllegal EventReason = "RestartParameterIllegal" + RollingRestart EventReason = "Restarting" ) type Event struct { diff --git a/pkg/controller/sub_controller/fe/controller.go b/pkg/controller/sub_controller/fe/controller.go index b816f470..1f9f586c 100644 --- a/pkg/controller/sub_controller/fe/controller.go +++ b/pkg/controller/sub_controller/fe/controller.go @@ -57,7 +57,7 @@ func (fc *Controller) UpdateComponentStatus(cluster *v1.DorisCluster) error { return nil } - return fc.ClassifyPodsByStatus(cluster.Namespace, cluster.Status.FEStatus, v1.GenerateStatefulSetSelector(cluster, v1.Component_FE), *cluster.Spec.FeSpec.Replicas) + return fc.ClassifyPodsByStatus(cluster.Namespace, cluster.Status.FEStatus, v1.GenerateStatefulSetSelector(cluster, v1.Component_FE), *cluster.Spec.FeSpec.Replicas, v1.Component_FE) } // New construct a FeController. diff --git a/pkg/controller/sub_controller/sub_controller.go b/pkg/controller/sub_controller/sub_controller.go index cf5f6979..80bdba72 100644 --- a/pkg/controller/sub_controller/sub_controller.go +++ b/pkg/controller/sub_controller/sub_controller.go @@ -59,13 +59,16 @@ type SubDefaultController struct { func (d *SubDefaultController) CheckRestartTimeAndInject(dcr *dorisv1.DorisCluster, componentType dorisv1.ComponentType) bool { var baseSpec *dorisv1.BaseSpec var restartedAt string + var restartAnnotationsKey string switch componentType { case dorisv1.Component_FE: baseSpec = &dcr.Spec.FeSpec.BaseSpec restartedAt = dcr.Annotations[dorisv1.FERestartAt] + restartAnnotationsKey = dorisv1.FERestartAt case dorisv1.Component_BE: baseSpec = &dcr.Spec.BeSpec.BaseSpec restartedAt = dcr.Annotations[dorisv1.BERestartAt] + restartAnnotationsKey = dorisv1.BERestartAt default: klog.Errorf("CheckRestartTimeAndInject dorisClusterName %s, namespace %s componentType %s not supported.", dcr.Name, dcr.Namespace, componentType) } @@ -80,7 +83,7 @@ func (d *SubDefaultController) CheckRestartTimeAndInject(dcr *dorisv1.DorisClust if err != nil { checkErr := fmt.Errorf("CheckRestartTimeAndInject error: time format is incorrect. dorisClusterName: %s, namespace: %s, componentType %s, wrong parse 'restartedAt': %s , error: %s", dcr.Name, dcr.Namespace, componentType, restartedAt, err.Error()) klog.Error(checkErr.Error()) - d.K8srecorder.Event(dcr, string(EventWarning), string(RestartParamIllegal), checkErr.Error()) + d.K8srecorder.Event(dcr, string(EventWarning), string(RestartParameterIllegal), checkErr.Error()) return false } @@ -89,48 +92,54 @@ func (d *SubDefaultController) CheckRestartTimeAndInject(dcr *dorisv1.DorisClust if restartAt.After(parseTime) { checkErr := fmt.Errorf("CheckRestartTimeAndInject The time has expired, dorisClusterName: %s, namespace: %s, componentType %s, wrong parse 'restartedAt': %s : The time has expired, if you want to restart doris, please set a future time", dcr.Name, dcr.Namespace, componentType, restartedAt) klog.Error(checkErr.Error()) - d.K8srecorder.Event(dcr, string(EventWarning), string(RestartParamIllegal), checkErr.Error()) + d.K8srecorder.Event(dcr, string(EventWarning), string(RestartParameterIllegal), checkErr.Error()) return false } - klog.Infof("CheckRestartTime successed, DCR %s in namespace %s, will restart %s ", dcr.Name, dcr.Namespace, componentType) - d.K8srecorder.Event( - dcr, - string(EventNormal), - string(RollingRestart), - fmt.Sprintf("CheckRestartTime successed, DCR %s in namespace %s, restart %s ", dcr.Name, dcr.Namespace, componentType), - ) - // check passed, set annotations to doriscluster baseSpec if baseSpec.Annotations == nil { baseSpec.Annotations = make(map[string]string) } - baseSpec.Annotations[dorisv1.DorisRollingRestartAt] = restartedAt + baseSpec.Annotations[restartAnnotationsKey] = restartedAt return true } // UpdateStatus update the component status on src. -func (d *SubDefaultController) UpdateStatus(namespace string, status *dorisv1.ComponentStatus, labels map[string]string, replicas int32) error { - return d.ClassifyPodsByStatus(namespace, status, labels, replicas) +func (d *SubDefaultController) UpdateStatus(namespace string, status *dorisv1.ComponentStatus, labels map[string]string, replicas int32, componentType dorisv1.ComponentType) error { + return d.ClassifyPodsByStatus(namespace, status, labels, replicas, componentType) } -func (d *SubDefaultController) ClassifyPodsByStatus(namespace string, status *dorisv1.ComponentStatus, labels map[string]string, replicas int32) error { +func (d *SubDefaultController) ClassifyPodsByStatus(namespace string, status *dorisv1.ComponentStatus, labels map[string]string, replicas int32, componentType dorisv1.ComponentType) error { var podList corev1.PodList if err := d.K8sclient.List(context.Background(), &podList, client.InNamespace(namespace), client.MatchingLabels(labels)); err != nil { return err } var creatings, readys, faileds []string - var firstRestartAnnotation string + var firstRestartAnnotation, restartAnnotationsKey string podmap := make(map[string]corev1.Pod) - if len(podList.Items) > 0 { - firstRestartAnnotation = podList.Items[0].Annotations[dorisv1.DorisRollingRestartAt] + + if len(podList.Items) == 0 { + status.RunningMembers = readys + status.FailedMembers = faileds + status.CreatingMembers = creatings + return nil } + switch componentType { + case dorisv1.Component_FE: + restartAnnotationsKey = dorisv1.FERestartAt + case dorisv1.Component_BE: + restartAnnotationsKey = dorisv1.BERestartAt + } + firstRestartAnnotation = podList.Items[0].Annotations[restartAnnotationsKey] + //get all pod status that controlled by st. stsRollingRestartAnnotationsSameCheck := true for _, pod := range podList.Items { - stsRollingRestartAnnotationsSameCheck = stsRollingRestartAnnotationsSameCheck && pod.Annotations[dorisv1.DorisRollingRestartAt] == firstRestartAnnotation + if pod.Annotations[restartAnnotationsKey] != firstRestartAnnotation { + stsRollingRestartAnnotationsSameCheck = false + } podmap[pod.Name] = pod if ready := k8s.PodIsReady(&pod.Status); ready { readys = append(readys, pod.Name) From c2cb4eb6eeaca81afd29d188858ec2ad78de66bd Mon Sep 17 00:00:00 2001 From: catpineapple Date: Fri, 22 Nov 2024 11:46:58 +0800 Subject: [PATCH 4/4] rolling restart --- pkg/controller/sub_controller/events.go | 3 +-- pkg/controller/sub_controller/sub_controller.go | 14 +++++--------- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/pkg/controller/sub_controller/events.go b/pkg/controller/sub_controller/events.go index b8b57c90..3e3c5993 100644 --- a/pkg/controller/sub_controller/events.go +++ b/pkg/controller/sub_controller/events.go @@ -70,8 +70,7 @@ var ( MSServiceDeletedFailed EventReason = "MSServiceDeletedFailed" MSStatefulsetDeleteFailed EventReason = "MSStatefulsetDeleteFailed" FDBAddressNotConfiged EventReason = "FDBAddressNotConfiged" - RestartParameterIllegal EventReason = "RestartParameterIllegal" - RollingRestart EventReason = "Restarting" + RestartTimeInvalid EventReason = "RestartTimeInvalid" ) type Event struct { diff --git a/pkg/controller/sub_controller/sub_controller.go b/pkg/controller/sub_controller/sub_controller.go index 80bdba72..4c93716e 100644 --- a/pkg/controller/sub_controller/sub_controller.go +++ b/pkg/controller/sub_controller/sub_controller.go @@ -83,16 +83,15 @@ func (d *SubDefaultController) CheckRestartTimeAndInject(dcr *dorisv1.DorisClust if err != nil { checkErr := fmt.Errorf("CheckRestartTimeAndInject error: time format is incorrect. dorisClusterName: %s, namespace: %s, componentType %s, wrong parse 'restartedAt': %s , error: %s", dcr.Name, dcr.Namespace, componentType, restartedAt, err.Error()) klog.Error(checkErr.Error()) - d.K8srecorder.Event(dcr, string(EventWarning), string(RestartParameterIllegal), checkErr.Error()) + d.K8srecorder.Event(dcr, string(EventWarning), string(RestartTimeInvalid), checkErr.Error()) return false } - restartAt := time.Now().Add(-10 * time.Minute) + effectiveStartTime := time.Now().Add(-10 * time.Minute) - if restartAt.After(parseTime) { - checkErr := fmt.Errorf("CheckRestartTimeAndInject The time has expired, dorisClusterName: %s, namespace: %s, componentType %s, wrong parse 'restartedAt': %s : The time has expired, if you want to restart doris, please set a future time", dcr.Name, dcr.Namespace, componentType, restartedAt) - klog.Error(checkErr.Error()) - d.K8srecorder.Event(dcr, string(EventWarning), string(RestartParameterIllegal), checkErr.Error()) + if effectiveStartTime.After(parseTime) { + klog.Errorf("CheckRestartTimeAndInject The time has expired, dorisClusterName: %s, namespace: %s, componentType %s, wrong parse 'restartedAt': %s : The time has expired, if you want to restart doris, please set a future time", dcr.Name, dcr.Namespace, componentType, restartedAt) + d.K8srecorder.Event(dcr, string(EventWarning), string(RestartTimeInvalid), fmt.Sprintf("the %s restart time is not effective. the 'restartedAt' %s can't be earlier than 10 minutes before the current time", componentType, restartedAt)) return false } @@ -120,9 +119,6 @@ func (d *SubDefaultController) ClassifyPodsByStatus(namespace string, status *do podmap := make(map[string]corev1.Pod) if len(podList.Items) == 0 { - status.RunningMembers = readys - status.FailedMembers = faileds - status.CreatingMembers = creatings return nil }