Skip to content

Commit 3e5da9b

Browse files
committed
Merge branch '236-promote-strategy' into 'master'
fix: check the sync instance state before running the promotion stage (#236) Closes #236 See merge request postgres-ai/database-lab!277
2 parents cd40526 + cca2183 commit 3e5da9b

File tree

4 files changed

+101
-12
lines changed

4 files changed

+101
-12
lines changed

pkg/retrieval/engine/postgres/snapshot/physical.go

+50-5
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,14 @@ func (p *PhysicalInitial) run(ctx context.Context) (err error) {
298298
}
299299
}()
300300

301+
var syncErr error
302+
303+
if p.options.Promotion.Enabled {
304+
if syncErr = p.checkSyncInstance(ctx); syncErr != nil {
305+
log.Dbg(fmt.Sprintf("failed to check the sync instance before snapshotting: %v", syncErr), "Changing the promotion strategy")
306+
}
307+
}
308+
301309
// Prepare pre-snapshot.
302310
snapshotName, err := p.cloneManager.CreateSnapshot("", preDataStateAt+pre)
303311
if err != nil {
@@ -326,7 +334,7 @@ func (p *PhysicalInitial) run(ctx context.Context) (err error) {
326334

327335
// Promotion.
328336
if p.options.Promotion.Enabled {
329-
if err := p.promoteInstance(ctx, path.Join(p.fsPool.ClonesDir(), cloneName, p.fsPool.DataSubDir)); err != nil {
337+
if err := p.promoteInstance(ctx, path.Join(p.fsPool.ClonesDir(), cloneName, p.fsPool.DataSubDir), syncErr); err != nil {
330338
return errors.Wrap(err, "failed to promote instance")
331339
}
332340
}
@@ -353,6 +361,29 @@ func (p *PhysicalInitial) run(ctx context.Context) (err error) {
353361
return nil
354362
}
355363

364+
// checkSyncInstance verifies that the sync instance container is usable before snapshotting.
// It inspects the container named by syncInstanceName, runs the readiness check against it,
// and then calls checkpoint on that container. The first failing step aborts the check and
// its error is returned (wrapped with context for the readiness and checkpoint steps).
func (p *PhysicalInitial) checkSyncInstance(ctx context.Context) error {
365+
syncContainer, err := p.dockerClient.ContainerInspect(ctx, p.syncInstanceName())
366+
if err != nil {
367+
return err
368+
}
369+
370+
if err := tools.CheckContainerReadiness(ctx, p.dockerClient, syncContainer.ID); err != nil {
371+
return errors.Wrap(err, "failed to readiness check")
372+
}
373+
374+
log.Msg("Sync instance has been checked. It is running")
375+
376+
// NOTE(review): checkpoint is defined outside this view; presumably it issues a
// Postgres CHECKPOINT inside the sync container — confirm against its definition.
if err := p.checkpoint(ctx, syncContainer.ID); err != nil {
377+
return errors.Wrap(err, "failed to make a checkpoint for sync instance")
378+
}
379+
380+
return nil
381+
}
382+
383+
// syncInstanceName returns the name of the sync instance container:
// the sync instance container prefix followed by the configured instance ID.
func (p *PhysicalInitial) syncInstanceName() string {
384+
return cont.SyncInstanceContainerPrefix + p.globalCfg.InstanceID
385+
}
386+
356387
func (p *PhysicalInitial) startScheduler(ctx context.Context) {
357388
if p.scheduler == nil || !p.hasSchedulingOptions() {
358389
return
@@ -409,7 +440,7 @@ func (p *PhysicalInitial) promoteContainerName() string {
409440
return promoteContainerPrefix + p.globalCfg.InstanceID
410441
}
411442

412-
func (p *PhysicalInitial) promoteInstance(ctx context.Context, clonePath string) (err error) {
443+
func (p *PhysicalInitial) promoteInstance(ctx context.Context, clonePath string, syncErr error) (err error) {
413444
p.promotionMutex.Lock()
414445
defer p.promotionMutex.Unlock()
415446

@@ -436,10 +467,17 @@ func (p *PhysicalInitial) promoteInstance(ctx context.Context, clonePath string)
436467
}
437468
}
438469

439-
recoveryConfig := buildRecoveryConfig(recoveryFileConfig, p.options.Promotion.Recovery)
470+
recoveryConfig := make(map[string]string)
471+
472+
// Item 5. Remove a recovery file: https://gitlab.com/postgres-ai/database-lab/-/issues/236#note_513401256
473+
if syncErr != nil {
474+
recoveryConfig = buildRecoveryConfig(recoveryFileConfig, p.options.Promotion.Recovery)
440475

441-
if err := cfgManager.ApplyRecovery(recoveryFileConfig); err != nil {
442-
return errors.Wrap(err, "failed to apply recovery configuration")
476+
if err := cfgManager.ApplyRecovery(recoveryFileConfig); err != nil {
477+
return errors.Wrap(err, "failed to apply recovery configuration")
478+
}
479+
} else if err := cfgManager.RemoveRecoveryConfig(); err != nil {
480+
log.Err(errors.Wrap(err, "failed to remove recovery config file"))
443481
}
444482

445483
// Apply promotion configs.
@@ -557,6 +595,13 @@ func (p *PhysicalInitial) promoteInstance(ctx context.Context, clonePath string)
557595
return errors.Wrap(err, "failed to store prepared configuration")
558596
}
559597

598+
const pgStopTimeout = 600
599+
600+
if err := tools.StopPostgres(ctx, p.dockerClient, promoteCont.ID, clonePath, pgStopTimeout); err != nil {
601+
log.Msg("Failed to stop Postgres", err)
602+
tools.PrintContainerLogs(ctx, p.dockerClient, promoteCont.ID)
603+
}
604+
560605
return nil
561606
}
562607

pkg/retrieval/engine/postgres/tools/cont/container.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ func StopControlContainers(ctx context.Context, dockerClient *client.Client, ins
7070
if shouldStopInternalProcess(controlLabel) {
7171
log.Msg("Stopping control container: ", containerName)
7272

73-
if err := tools.StopPostgres(ctx, dockerClient, controlCont.ID, dataDir); err != nil {
73+
if err := tools.StopPostgres(ctx, dockerClient, controlCont.ID, dataDir, tools.DefaultStopTimeout); err != nil {
7474
log.Msg("Failed to stop Postgres", err)
7575
tools.PrintContainerLogs(ctx, dockerClient, controlCont.ID)
7676

pkg/retrieval/engine/postgres/tools/tools.go

+10-2
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@ const (
4040
maxValuesToReturn = 1
4141
essentialLogsInterval = "10s"
4242

43+
// DefaultStopTimeout defines the default timeout for Postgres stop.
44+
DefaultStopTimeout = 60
45+
4346
// ViewLogsCmd tells the command to view docker container logs.
4447
ViewLogsCmd = "docker logs --since 1m -f"
4548

@@ -156,15 +159,20 @@ func GetMountsFromMountPoints(dataDir string, mountPoints []types.MountPoint) []
156159
}
157160

158161
// StopPostgres stops Postgres inside container.
159-
func StopPostgres(ctx context.Context, dockerClient *client.Client, containerID, dataDir string) error {
162+
func StopPostgres(ctx context.Context, dockerClient *client.Client, containerID, dataDir string, timeout int) error {
160163
pgVersion, err := DetectPGVersion(dataDir)
161164
if err != nil {
162165
return errors.Wrap(err, "failed to detect PostgreSQL version")
163166
}
164167

168+
stopCommand := []string{fmt.Sprintf("/usr/lib/postgresql/%g/bin/pg_ctl", pgVersion),
169+
"-D", dataDir, "-w", "--timeout", strconv.Itoa(timeout), "stop"}
170+
171+
log.Msg("Stopping PostgreSQL instance", stopCommand)
172+
165173
if err := ExecCommand(ctx, dockerClient, containerID, types.ExecConfig{
166174
User: defaults.Username,
167-
Cmd: []string{fmt.Sprintf("/usr/lib/postgresql/%g/bin/pg_ctl", pgVersion), "-D", dataDir, "stop"},
175+
Cmd: stopCommand,
168176
}); err != nil {
169177
return errors.Wrap(err, "failed to stop Postgres")
170178
}

pkg/services/provision/databases/postgres/pgconfig/configuration.go

+40-4
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,12 @@ const (
4242
// recoveryConfName defines the name of recovery Postgres (<11) config.
4343
recoveryConfName = "recovery.conf"
4444

45+
// standbySignal defines the name of the file which means that standby mode is initialized for Postgres (>=12).
46+
standbySignal = "standby.signal"
47+
48+
// recoverySignal defines the name of the file which means that recovery is initialized for Postgres (>=12).
49+
recoverySignal = "recovery.signal"
50+
4551
// Database Lab configuration files.
4652
// configPrefix defines a file prefix for Database Lab configuration files.
4753
configPrefix = "postgresql.dblab."
@@ -307,7 +313,7 @@ func (m *Manager) ApplyRecovery(cfg map[string]string) error {
307313
}
308314

309315
if m.pgVersion >= defaults.PGVersion12 {
310-
if err := tools.TouchFile(path.Join(m.dataDir, "standby.signal")); err != nil {
316+
if err := tools.TouchFile(path.Join(m.dataDir, standbySignal)); err != nil {
311317
return err
312318
}
313319
}
@@ -335,10 +341,30 @@ func (m *Manager) TruncateRecoveryConfig() error {
335341

336342
// RemoveRecoveryConfig removes a recovery configuration file and, for Postgres 12 and newer, the standby and recovery signal files.
337343
func (m *Manager) RemoveRecoveryConfig() error {
338-
err := os.Remove(m.recoveryPath())
339-
if pathError, ok := err.(*os.PathError); ok {
340-
log.Dbg("failed to remove a recovery configuration file: ", pathError.Error())
344+
if err := m.removeOptionally(m.recoveryPath()); err != nil {
345+
return err
346+
}
341347

348+
if m.pgVersion < defaults.PGVersion12 {
349+
return nil
350+
}
351+
352+
if err := m.removeOptionally(m.standbySignalPath()); err != nil {
353+
return err
354+
}
355+
356+
if err := m.removeOptionally(m.recoverySignalPath()); err != nil {
357+
return err
358+
}
359+
360+
return nil
361+
}
362+
363+
func (m *Manager) removeOptionally(filepath string) error {
364+
err := os.Remove(filepath)
365+
366+
if pathError, ok := err.(*os.PathError); ok {
367+
log.Dbg(pathError.Error())
342368
return nil
343369
}
344370

@@ -420,6 +446,16 @@ func (m Manager) recoveryFilename() string {
420446
return recoveryConfName
421447
}
422448

449+
// recoverySignalPath returns the path of the recovery signal file (recoverySignal) inside the data directory.
450+
func (m Manager) recoverySignalPath() string {
451+
return path.Join(m.dataDir, recoverySignal)
452+
}
453+
454+
// standbySignalPath returns the path of the standby signal file (standbySignal) inside the data directory.
455+
func (m Manager) standbySignalPath() string {
456+
return path.Join(m.dataDir, standbySignal)
457+
}
458+
423459
// rewriteConfig completely rewrites a configuration file with the provided parameters.
424460
func (m *Manager) rewriteConfig(pgConf string, extraConfig map[string]string) error {
425461
log.Dbg("Applying configuration: ", pgConf)

0 commit comments

Comments
 (0)