Skip to content

Commit b0cfeb3

Browse files
authored
Partially revert #2810 (#2849)
Only schedule switchover for pod migration, consider maintenanceWindow for PGVERSION env change
1 parent e04b91d commit b0cfeb3

File tree

6 files changed

+48
-37
lines changed

6 files changed

+48
-37
lines changed

docs/administrator.md

-3
Original file line numberDiff line numberDiff line change
@@ -208,9 +208,6 @@ Note that, changes in `SPILO_CONFIGURATION` env variable under `bootstrap.dcs`
208208
path are ignored for the diff. They will be applied through Patroni's rest api
209209
interface, following a restart of all instances.
210210

211-
Rolling update is postponed until the next maintenance window if any is defined
212-
under the `maintenanceWindows` cluster manifest parameter.
213-
214211
The operator also supports lazy updates of the Spilo image. In this case the
215212
StatefulSet is only updated, but no rolling update follows. This feature saves
216213
you a switchover - and hence downtime - when you know pods are re-started later

docs/reference/cluster_manifest.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ These parameters are grouped directly under the `spec` key in the manifest.
116116

117117
* **maintenanceWindows**
118118
a list which defines specific time frames when certain maintenance operations
119-
such as automatic major upgrades or rolling updates are allowed. Accepted formats
119+
such as automatic major upgrades or master pod migration. Accepted formats
120120
are "01:00-06:00" for daily maintenance windows or "Sat:00:00-04:00" for specific
121121
days, with all times in UTC.
122122

e2e/tests/test_e2e.py

+11-10
Original file line numberDiff line numberDiff line change
@@ -1187,7 +1187,7 @@ def test_major_version_upgrade(self):
11871187
Test major version upgrade: with full upgrade, maintenance window, and annotation
11881188
"""
11891189
def check_version():
1190-
p = k8s.patroni_rest("acid-upgrade-test-0", "")
1190+
p = k8s.patroni_rest("acid-upgrade-test-0", "") or {}
11911191
version = p.get("server_version", 0) // 10000
11921192
return version
11931193

@@ -1237,7 +1237,7 @@ def get_annotations():
12371237
# should not upgrade because current time is not in maintenanceWindow
12381238
current_time = datetime.now()
12391239
maintenance_window_future = f"{(current_time+timedelta(minutes=60)).strftime('%H:%M')}-{(current_time+timedelta(minutes=120)).strftime('%H:%M')}"
1240-
pg_patch_version_15 = {
1240+
pg_patch_version_15_outside_mw = {
12411241
"spec": {
12421242
"postgresql": {
12431243
"version": "15"
@@ -1248,7 +1248,7 @@ def get_annotations():
12481248
}
12491249
}
12501250
k8s.api.custom_objects_api.patch_namespaced_custom_object(
1251-
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15)
1251+
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15_outside_mw)
12521252
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
12531253

12541254
# no pod replacement outside of the maintenance window
@@ -1259,12 +1259,12 @@ def get_annotations():
12591259
second_annotations = get_annotations()
12601260
self.assertIsNone(second_annotations.get("last-major-upgrade-failure"), "Annotation for last upgrade's failure should not be set")
12611261

1262-
# change the version again to trigger operator sync
1262+
# change maintenanceWindows to current
12631263
maintenance_window_current = f"{(current_time-timedelta(minutes=30)).strftime('%H:%M')}-{(current_time+timedelta(minutes=30)).strftime('%H:%M')}"
1264-
pg_patch_version_16 = {
1264+
pg_patch_version_15_in_mw = {
12651265
"spec": {
12661266
"postgresql": {
1267-
"version": "16"
1267+
"version": "15"
12681268
},
12691269
"maintenanceWindows": [
12701270
maintenance_window_current
@@ -1273,13 +1273,13 @@ def get_annotations():
12731273
}
12741274

12751275
k8s.api.custom_objects_api.patch_namespaced_custom_object(
1276-
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_16)
1276+
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15_in_mw)
12771277
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
12781278

12791279
k8s.wait_for_pod_failover(master_nodes, 'spilo-role=master,' + cluster_label)
12801280
k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
12811281
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
1282-
self.eventuallyEqual(check_version, 16, "Version should be upgraded from 14 to 16")
1282+
self.eventuallyEqual(check_version, 15, "Version should be upgraded from 14 to 15")
12831283

12841284
# check if annotation for last upgrade's success is updated after second upgrade
12851285
third_annotations = get_annotations()
@@ -1306,16 +1306,17 @@ def get_annotations():
13061306
k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label)
13071307
k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
13081308
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
1309-
self.eventuallyEqual(check_version, 16, "Version should not be upgraded because annotation for last upgrade's failure is set")
1309+
self.eventuallyEqual(check_version, 15, "Version should not be upgraded because annotation for last upgrade's failure is set")
13101310

13111311
# change the version back to 15 and should remove failure annotation
13121312
k8s.api.custom_objects_api.patch_namespaced_custom_object(
1313-
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15)
1313+
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15_in_mw)
13141314
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
13151315

13161316
k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
13171317
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
13181318

1319+
self.eventuallyEqual(check_version, 15, "Version should not be upgraded from 15")
13191320
fourth_annotations = get_annotations()
13201321
self.assertIsNone(fourth_annotations.get("last-major-upgrade-failure"), "Annotation for last upgrade's failure is not removed")
13211322

pkg/cluster/cluster.go

+23-15
Original file line numberDiff line numberDiff line change
@@ -957,6 +957,11 @@ func (c *Cluster) Update(oldSpec, newSpec *acidv1.Postgresql) error {
957957
defer c.mu.Unlock()
958958

959959
c.KubeClient.SetPostgresCRDStatus(c.clusterName(), acidv1.ClusterStatusUpdating)
960+
961+
if !isInMaintenanceWindow(newSpec.Spec.MaintenanceWindows) {
962+
// do not apply any major version related changes yet
963+
newSpec.Spec.PostgresqlParam.PgVersion = oldSpec.Spec.PostgresqlParam.PgVersion
964+
}
960965
c.setSpec(newSpec)
961966

962967
defer func() {
@@ -1761,35 +1766,38 @@ func (c *Cluster) GetSwitchoverSchedule() string {
17611766
}
17621767

17631768
// Switchover does a switchover (via Patroni) to a candidate pod
1764-
func (c *Cluster) Switchover(curMaster *v1.Pod, candidate spec.NamespacedName) error {
1769+
func (c *Cluster) Switchover(curMaster *v1.Pod, candidate spec.NamespacedName, scheduled bool) error {
17651770
var err error
1766-
c.logger.Debugf("switching over from %q to %q", curMaster.Name, candidate)
1767-
1768-
if !isInMaintenanceWindow(c.Spec.MaintenanceWindows) {
1769-
c.logger.Infof("postponing switchover, not in maintenance window")
1770-
schedule := c.GetSwitchoverSchedule()
17711771

1772-
if err := c.patroni.Switchover(curMaster, candidate.Name, schedule); err != nil {
1773-
return fmt.Errorf("could not schedule switchover: %v", err)
1774-
}
1775-
c.logger.Infof("switchover is scheduled at %s", schedule)
1776-
return nil
1777-
}
1778-
1779-
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Switchover", "Switching over from %q to %q", curMaster.Name, candidate)
17801772
stopCh := make(chan struct{})
17811773
ch := c.registerPodSubscriber(candidate)
17821774
defer c.unregisterPodSubscriber(candidate)
17831775
defer close(stopCh)
17841776

1785-
if err = c.patroni.Switchover(curMaster, candidate.Name, ""); err == nil {
1777+
var scheduled_at string
1778+
if scheduled {
1779+
scheduled_at = c.GetSwitchoverSchedule()
1780+
} else {
1781+
c.logger.Debugf("switching over from %q to %q", curMaster.Name, candidate)
1782+
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Switchover", "Switching over from %q to %q", curMaster.Name, candidate)
1783+
scheduled_at = ""
1784+
}
1785+
1786+
if err = c.patroni.Switchover(curMaster, candidate.Name, scheduled_at); err == nil {
1787+
if scheduled {
1788+
c.logger.Infof("switchover from %q to %q is scheduled at %s", curMaster.Name, candidate, scheduled_at)
1789+
return nil
1790+
}
17861791
c.logger.Debugf("successfully switched over from %q to %q", curMaster.Name, candidate)
17871792
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Switchover", "Successfully switched over from %q to %q", curMaster.Name, candidate)
17881793
_, err = c.waitForPodLabel(ch, stopCh, nil)
17891794
if err != nil {
17901795
err = fmt.Errorf("could not get master pod label: %v", err)
17911796
}
17921797
} else {
1798+
if scheduled {
1799+
return fmt.Errorf("could not schedule switchover: %v", err)
1800+
}
17931801
err = fmt.Errorf("could not switch over from %q to %q: %v", curMaster.Name, candidate, err)
17941802
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Switchover", "Switchover from %q to %q FAILED: %v", curMaster.Name, candidate, err)
17951803
}

pkg/cluster/pod.go

+8-3
Original file line numberDiff line numberDiff line change
@@ -280,11 +280,16 @@ func (c *Cluster) MigrateMasterPod(podName spec.NamespacedName) error {
280280
return fmt.Errorf("could not move pod: %v", err)
281281
}
282282

283+
scheduleSwitchover := false
284+
if !isInMaintenanceWindow(c.Spec.MaintenanceWindows) {
285+
c.logger.Infof("postponing switchover, not in maintenance window")
286+
scheduleSwitchover = true
287+
}
283288
err = retryutil.Retry(1*time.Minute, 5*time.Minute,
284289
func() (bool, error) {
285-
err := c.Switchover(oldMaster, masterCandidateName)
290+
err := c.Switchover(oldMaster, masterCandidateName, scheduleSwitchover)
286291
if err != nil {
287-
c.logger.Errorf("could not failover to pod %q: %v", masterCandidateName, err)
292+
c.logger.Errorf("could not switchover to pod %q: %v", masterCandidateName, err)
288293
return false, nil
289294
}
290295
return true, nil
@@ -445,7 +450,7 @@ func (c *Cluster) recreatePods(pods []v1.Pod, switchoverCandidates []spec.Namesp
445450
// do not recreate master now so it will keep the update flag and switchover will be retried on next sync
446451
return fmt.Errorf("skipping switchover: %v", err)
447452
}
448-
if err := c.Switchover(masterPod, masterCandidate); err != nil {
453+
if err := c.Switchover(masterPod, masterCandidate, false); err != nil {
449454
return fmt.Errorf("could not perform switch over: %v", err)
450455
}
451456
} else if newMasterPod == nil && len(replicas) == 0 {

pkg/cluster/sync.go

+5-5
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,11 @@ func (c *Cluster) Sync(newSpec *acidv1.Postgresql) error {
9797
}
9898
}
9999

100+
if !isInMaintenanceWindow(newSpec.Spec.MaintenanceWindows) {
101+
// do not apply any major version related changes yet
102+
newSpec.Spec.PostgresqlParam.PgVersion = oldSpec.Spec.PostgresqlParam.PgVersion
103+
}
104+
100105
if err = c.syncStatefulSet(); err != nil {
101106
if !k8sutil.ResourceAlreadyExists(err) {
102107
err = fmt.Errorf("could not sync statefulsets: %v", err)
@@ -658,11 +663,6 @@ func (c *Cluster) syncStatefulSet() error {
658663
isSafeToRecreatePods = false
659664
}
660665

661-
if !isInMaintenanceWindow(c.Spec.MaintenanceWindows) {
662-
postponeReasons = append(postponeReasons, "not in maintenance window")
663-
isSafeToRecreatePods = false
664-
}
665-
666666
// if we get here we also need to re-create the pods (either leftovers from the old
667667
// statefulset or those that got their configuration from the outdated statefulset)
668668
if len(podsToRecreate) > 0 {

0 commit comments

Comments
 (0)