do not update when the cluster is unhealthy

yb · yanboer · commit ebdeb304761f · 2022-11-30T12:44:17.000+08:00
diff --git a/docs/index.yaml b/docs/index.yaml
@@ -3,9 +3,9 @@ entries:
   postgres-operator:
   - apiVersion: v2
     appVersion: v1.1.1
-    created: "2022-11-14T13:27:14.428332408+08:00"
+    created: "2022-11-30T12:43:03.631056582+08:00"
     description: A Helm chart for Kubernetes
-    digest: 0e589d8472e5ab40c4286cc36ca96f773088b38916bcd753792711429991ba65
+    digest: bf36d07d7d5392f495105b37e7c3de1da247b99b66fef0d0afd52203262ef05d
     home: https://github.com/radondb/multi-platform-postgresql
     keywords:
     - operator
@@ -27,7 +27,7 @@ entries:
     version: v1.0.1
   - apiVersion: v2
     appVersion: v1.1.0
-    created: "2022-11-14T13:27:14.427794075+08:00"
+    created: "2022-11-30T12:43:03.630493625+08:00"
     description: A Helm chart for Kubernetes
     digest: 1f6593413ae373c5caa7b8c41a3cd3697fbb164ba417260a3fefa6d32a1852e8
     home: https://github.com/radondb/multi-platform-postgresql
@@ -49,4 +49,4 @@ entries:
     urls:
     - https://radondb.github.io/multi-platform-postgresql/postgres-operator-v1.0.0.tgz
     version: v1.0.0
-generated: "2022-11-14T13:27:14.427097554+08:00"
+generated: "2022-11-30T12:43:03.629722869+08:00"
diff --git a/docs/postgres-operator-v1.0.1.tgz b/docs/postgres-operator-v1.0.1.tgz
diff --git a/platforms/kubernetes/postgres-operator/deploy/postgres-operator.yaml.template b/platforms/kubernetes/postgres-operator/deploy/postgres-operator.yaml.template
@@ -33,16 +33,26 @@ spec:
       name: State
       type: string
       priority: 0           # show in standard view
+    - jsonPath: .spec.updatetoleration
+      description: update disable when the cluster status is unhealthy
+      name: Updatetoleration
+      type: boolean
+      priority: 1           # show in wide view
+    - jsonPath: .spec.volume_type
+      description: if volume type is local, rebuild pvc during rolling upgrade
+      name: Volumetype
+      type: string
+      priority: 1           # show in wide view
     - jsonPath: .spec.autofailover.podspec.containers[0].image
       description: The autofailover image
       name: FailoverImage
       type: string
-      priority: 1           # show in standard view
+      priority: 1           # show in wide view
     - jsonPath: .spec.postgresql.readwriteinstance.podspec.containers[0].image
       description: The postgresql image
       name: PostgresqlImage
       type: string
-      priority: 1           # show in standard view
+      priority: 1           # show in wide view
     - jsonPath: .spec.postgresql.readwriteinstance.replicas
       description: The readwriteinstance nodes
       name: RWnodes
@@ -93,11 +103,18 @@ spec:
                 enum:
                 - true
                 - false
+              updatetoleration:
+                type: boolean
+                enum:
+                - true
+                - false
+                default: false
               volume_type:
                 type: string
                 enum:
                 - 'local'
                 - 'cloud'
+                default: 'local'
               antiaffinity:
                 type: object
                 x-kubernetes-preserve-unknown-fields: true
diff --git a/platforms/kubernetes/postgres-operator/deploy/postgresql.yaml b/platforms/kubernetes/postgres-operator/deploy/postgresql.yaml
@@ -6,7 +6,6 @@ metadata:
 spec:
   action: start #stop start
   deletepvc: true
-  volume_type: local       # local/cloud
   antiaffinity:
     policy: preferred                             # preferred/required
     podAntiAffinityTerm: autofailover-readwrite   # none/autofailover-readwrite/autofailover-readwrite-readonly
diff --git a/platforms/kubernetes/postgres-operator/postgres/constants.py b/platforms/kubernetes/postgres-operator/postgres/constants.py
@@ -32,6 +32,7 @@
 STREAMING_ASYNC = "async"
 STREAMING_SYNC = "sync"
 DELETE_PVC = "deletepvc"
+UPDATE_TOLERATION = "updatetoleration"
 POSTGRESQL_PVC_NAME = "data"
 SUCCESS = "exec_success"
 FAILED = "exec_failed"
diff --git a/platforms/kubernetes/postgres-operator/postgres/handle.py b/platforms/kubernetes/postgres-operator/postgres/handle.py
@@ -138,6 +138,7 @@
     MINUTES,
     HOURS,
     DAYS,
+    UPDATE_TOLERATION,
 )
 
 PGLOG_DIR = "log"
@@ -417,9 +418,12 @@ def waiting_cluster_final_status(
     status: kopf.Status,
     logger: logging.Logger,
     timeout: int = MINUTES * 1,
-) -> None:
+    except_nodes: int = None,
+) -> bool:
+    is_health = True
+
     if spec[ACTION] == ACTION_STOP:
-        return
+        return is_health
 
     # waiting for restart
     auto_failover_conns = connections(spec, meta, patch,
@@ -449,6 +453,7 @@ def waiting_cluster_final_status(
             if i >= maxtry:
                 logger.warning(
                     f"cluster maybe maybe not right. skip waitting.")
+                is_health = False
                 break
             output = exec_command(conn, primary_cmd, logger, interrupt=False)
             if output != '1':
@@ -475,6 +480,8 @@ def waiting_cluster_final_status(
                     spec.get(POSTGRESQL).get(READWRITEINSTANCE).get(MACHINES)
                 ) + len(
                     spec.get(POSTGRESQL).get(READONLYINSTANCE).get(MACHINES))
+            if except_nodes is not None:
+                total_nodes = except_nodes
             output = exec_command(conn, nodes_cmd, logger, interrupt=False)
             if output != str(total_nodes):
                 logger.warning(
@@ -484,6 +491,7 @@ def waiting_cluster_final_status(
 
             break
     auto_failover_conns.free_conns()
+    return is_health
 
 
 def waiting_cluster_correct_status(
@@ -3321,7 +3329,7 @@ def update_antiaffinity(
     timeout: int = MINUTES * 5,
 ) -> None:
     # local volume
-    if spec.get(SPEC_VOLUME_TYPE) == SPEC_VOLUME_LOCAL:
+    if spec.get(SPEC_VOLUME_TYPE, 'local') == SPEC_VOLUME_LOCAL:
         delete_disk = True
         timeout = HOURS * 1
     rolling_update(meta, spec, patch, status, logger, target_roles, exit,
@@ -3408,6 +3416,8 @@ def update_replicas(
 
         need_update_number_sync_standbys = True
 
+    waiting_cluster_final_status(meta, spec, patch, status, logger, 1 * HOURS)
+
     return need_update_number_sync_standbys
 
 
@@ -4019,6 +4029,48 @@ def local_create_user(OS: List,
         auto_failover_conns.free_conns()
 
 
+def get_except_nodes(
+    meta: kopf.Meta,
+    spec: kopf.Spec,
+    patch: kopf.Patch,
+    status: kopf.Status,
+    logger: logging.Logger,
+    diffs: kopf.Diff,
+) -> int:
+    """Return the node count the cluster had before ``diffs`` is applied.
+
+    Starts from the replica counts in ``spec`` and, for each replicas/machines
+    field present in ``diffs``, substitutes the OLD value (or its length for
+    machine lists). ``meta``, ``patch`` and ``status`` are unused.
+    """
+    mode, autofailover_replicas, readwrite_replicas, readonly_replicas = get_replicas(
+        spec)
+    except_readwrite_nodes = readwrite_replicas
+    except_readonly_nodes = readonly_replicas
+
+    for AC, FIELD, OLD, NEW in diffs:
+        if FIELD == DIFF_FIELD_READWRITE_REPLICAS:
+            if AC != DIFF_CHANGE:
+                logger.error(
+                    str(DIFF_FIELD_ACTION) + " only support " + DIFF_CHANGE)
+            else:
+                except_readwrite_nodes = OLD
+        elif FIELD == DIFF_FIELD_READWRITE_MACHINES:
+            if AC != DIFF_CHANGE:
+                logger.error(
+                    str(DIFF_FIELD_ACTION) + " only support " + DIFF_CHANGE)
+            else:
+                except_readwrite_nodes = len(OLD)
+        elif FIELD == DIFF_FIELD_READONLY_REPLICAS and OLD is not None:
+            # read-only diffs belong to the read-only count (was read-write);
+            # OLD is None-guarded so the final sum cannot hit None + int
+            except_readonly_nodes = OLD
+        elif FIELD == DIFF_FIELD_READONLY_MACHINES and OLD is not None:
+            except_readonly_nodes = len(OLD)
+
+    return except_readwrite_nodes + except_readonly_nodes
+
+
 # kubectl patch pg lzzhang --patch '{"spec": {"action": "stop"}}' --type=merge
 def update_cluster(
     meta: kopf.Meta,
@@ -4035,6 +4087,8 @@ def update_cluster(
         check_param(spec, logger, create=False)
         need_roll_update = False
         need_update_number_sync_standbys = False
+        update_toleration = spec.get(UPDATE_TOLERATION, False)
+        except_nodes = get_except_nodes(meta, spec, patch, status, logger, diffs)
 
         for diff in diffs:
             AC = diff[0]
@@ -4055,10 +4109,25 @@ def update_cluster(
             OLD = diff[2]
             NEW = diff[3]
 
+            if update_toleration == False and waiting_cluster_final_status(meta, spec, patch, status, logger, except_nodes=except_nodes) == False:
+                logger.error(f"cluster status is not health.")
+                raise kopf.PermanentError(f"cluster status is not health.")
+
             return_update_number_sync_standbys = update_replicas(meta, spec, patch, status, logger, AC, FIELD, OLD,
                             NEW)
             if need_update_number_sync_standbys == False and return_update_number_sync_standbys == True:
                 need_update_number_sync_standbys = True
+
+        for diff in diffs:
+            AC = diff[0]
+            FIELD = diff[1]
+            OLD = diff[2]
+            NEW = diff[3]
+
+            if update_toleration == False and waiting_cluster_final_status(meta, spec, patch, status, logger) == False:
+                logger.error(f"cluster status is not health.")
+                raise kopf.PermanentError(f"cluster status is not health.")
+
             update_podspec_volume(meta, spec, patch, status, logger, AC, FIELD,
                                   OLD, NEW)
             if FIELD[0:len(DIFF_FIELD_SPEC_ANTIAFFINITY
@@ -4077,6 +4146,10 @@ def update_cluster(
             OLD = diff[2]
             NEW = diff[3]
 
+            if update_toleration == False and waiting_cluster_final_status(meta, spec, patch, status, logger) == False:
+                logger.error(f"cluster status is not health.")
+                raise kopf.PermanentError(f"cluster status is not health.")
+
             update_hbas(meta, spec, patch, status, logger, AC, FIELD, OLD, NEW)
             update_users(meta, spec, patch, status, logger, AC, FIELD, OLD,
                          NEW)