Skip to content

Commit a61b8c4

Browse files
committed
Merge branch '350-postgres-shutdown' into 'master'
fix(engine): implement a general approach to gracefully shutting down Postgres containers (#350). Closes #350. See merge request postgres-ai/database-lab!547.
2 parents 96e8184 + 9a21bea commit a61b8c4

File tree

6 files changed

+88
-32
lines changed

6 files changed

+88
-32
lines changed

Diff for: engine/cmd/database-lab/main.go

+5-5
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ func main() {
126126
emergencyShutdown := func() {
127127
cancel()
128128

129-
shutdownDatabaseLabEngine(context.Background(), docker, engProps, pm.First())
129+
shutdownDatabaseLabEngine(context.Background(), docker, &cfg.Global.Database, engProps.InstanceID, pm.First())
130130
}
131131

132132
cloningSvc := cloning.NewBase(&cfg.Cloning, provisioner, tm, observingChan)
@@ -191,7 +191,7 @@ func main() {
191191
log.Msg(err)
192192
}
193193

194-
shutdownDatabaseLabEngine(ctxBackground, docker, engProps, pm.First())
194+
shutdownDatabaseLabEngine(ctxBackground, docker, &cfg.Global.Database, engProps.InstanceID, pm.First())
195195
cloningSvc.SaveClonesState()
196196
tm.SendEvent(ctxBackground, telemetry.EngineStoppedEvent, telemetry.EngineStopped{Uptime: server.Uptime()})
197197
}
@@ -290,14 +290,14 @@ func setShutdownListener() chan os.Signal {
290290
return c
291291
}
292292

293-
func shutdownDatabaseLabEngine(ctx context.Context, dockerCLI *client.Client, engProps global.EngineProps, fsm pool.FSManager) {
293+
func shutdownDatabaseLabEngine(ctx context.Context, docker *client.Client, dbCfg *global.Database, instanceID string, fsm pool.FSManager) {
294294
log.Msg("Stopping auxiliary containers")
295295

296-
if err := cont.StopControlContainers(ctx, dockerCLI, engProps.InstanceID, fsm); err != nil {
296+
if err := cont.StopControlContainers(ctx, docker, dbCfg, instanceID, fsm); err != nil {
297297
log.Err("Failed to stop control containers", err)
298298
}
299299

300-
if err := cont.CleanUpSatelliteContainers(ctx, dockerCLI, engProps.InstanceID); err != nil {
300+
if err := cont.CleanUpSatelliteContainers(ctx, docker, instanceID); err != nil {
301301
log.Err("Failed to stop satellite containers", err)
302302
}
303303

Diff for: engine/internal/retrieval/engine/postgres/logical/dump.go

+4
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,10 @@ func (d *DumpJob) Run(ctx context.Context) (err error) {
357357
return errors.Wrap(err, "failed to recalculate statistics after restore")
358358
}
359359

360+
if err := tools.RunCheckpoint(ctx, d.dockerClient, containerID, d.globalCfg.Database.User(), d.globalCfg.Database.DBName); err != nil {
361+
return errors.Wrap(err, "failed to run checkpoint before stop")
362+
}
363+
360364
if err := tools.StopPostgres(ctx, d.dockerClient, containerID, dataDir, tools.DefaultStopTimeout); err != nil {
361365
return errors.Wrap(err, "failed to stop Postgres instance")
362366
}

Diff for: engine/internal/retrieval/engine/postgres/logical/restore.go

+4
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,10 @@ func (r *RestoreJob) Run(ctx context.Context) (err error) {
259259
return errors.Wrap(err, "failed to recalculate statistics after restore")
260260
}
261261

262+
if err := tools.RunCheckpoint(ctx, r.dockerClient, containerID, r.globalCfg.Database.User(), r.globalCfg.Database.DBName); err != nil {
263+
return errors.Wrap(err, "failed to run checkpoint before stop")
264+
}
265+
262266
if err := tools.StopPostgres(ctx, r.dockerClient, containerID, dataDir, tools.DefaultStopTimeout); err != nil {
263267
return errors.Wrap(err, "failed to stop Postgres instance")
264268
}

Diff for: engine/internal/retrieval/engine/postgres/snapshot/physical.go

+32-25
Original file line numberDiff line numberDiff line change
@@ -149,8 +149,10 @@ type syncState struct {
149149
}
150150

151151
// NewPhysicalInitialJob creates a new physical initial job.
152-
func NewPhysicalInitialJob(cfg config.JobConfig, global *global.Config, engineProps global.EngineProps, cloneManager pool.FSManager,
153-
tm *telemetry.Agent) (*PhysicalInitial, error) {
152+
func NewPhysicalInitialJob(
153+
cfg config.JobConfig, global *global.Config, engineProps global.EngineProps, cloneManager pool.FSManager,
154+
tm *telemetry.Agent,
155+
) (*PhysicalInitial, error) {
154156
p := &PhysicalInitial{
155157
name: cfg.Spec.Name,
156158
cloneManager: cloneManager,
@@ -397,7 +399,13 @@ func (p *PhysicalInitial) checkSyncInstance(ctx context.Context) (string, error)
397399

398400
log.Msg("Sync instance has been checked. It is running")
399401

400-
if err := p.checkpoint(ctx, syncContainer.ID); err != nil {
402+
if err := tools.RunCheckpoint(
403+
ctx,
404+
p.dockerClient,
405+
syncContainer.ID,
406+
p.globalCfg.Database.User(),
407+
p.globalCfg.Database.Name(),
408+
); err != nil {
401409
return "", errors.Wrap(err, "failed to make a checkpoint for sync instance")
402410
}
403411

@@ -616,9 +624,8 @@ func (p *PhysicalInitial) promoteInstance(ctx context.Context, clonePath string,
616624
}
617625
}
618626

619-
// Checkpoint.
620-
if err := p.checkpoint(ctx, containerID); err != nil {
621-
return err
627+
if err := tools.RunCheckpoint(ctx, p.dockerClient, containerID, p.globalCfg.Database.User(), p.globalCfg.Database.Name()); err != nil {
628+
return errors.Wrap(err, "failed to run checkpoint")
622629
}
623630

624631
if err := cfgManager.RemoveRecoveryConfig(); err != nil {
@@ -646,7 +653,10 @@ func (p *PhysicalInitial) promoteInstance(ctx context.Context, clonePath string,
646653
return nil
647654
}
648655

649-
func (p *PhysicalInitial) getDSAFromWAL(ctx context.Context, pgVersion float64, containerID, cloneDir string) (string, error) {
656+
func (p *PhysicalInitial) getDSAFromWAL(ctx context.Context, pgVersion float64, containerID, cloneDir string) (
657+
string,
658+
error,
659+
) {
650660
log.Dbg(cloneDir)
651661

652662
walDirectory := walDir(cloneDir, pgVersion)
@@ -692,7 +702,12 @@ func walDir(cloneDir string, pgVersion float64) string {
692702
return path.Join(cloneDir, dir)
693703
}
694704

695-
func (p *PhysicalInitial) parseWAL(ctx context.Context, containerID string, pgVersion float64, walFilePath string) string {
705+
func (p *PhysicalInitial) parseWAL(
706+
ctx context.Context,
707+
containerID string,
708+
pgVersion float64,
709+
walFilePath string,
710+
) string {
696711
cmd := walCommand(pgVersion, walFilePath)
697712

698713
output, err := tools.ExecCommandWithOutput(ctx, p.dockerClient, containerID, types.ExecConfig{
@@ -768,7 +783,11 @@ func buildRecoveryConfig(fileConfig, userRecoveryConfig map[string]string) map[s
768783
return recoveryConf
769784
}
770785

771-
func (p *PhysicalInitial) markDSA(ctx context.Context, defaultDSA, containerID, dataDir string, pgVersion float64) error {
786+
func (p *PhysicalInitial) markDSA(
787+
ctx context.Context,
788+
defaultDSA, containerID, dataDir string,
789+
pgVersion float64,
790+
) error {
772791
extractedDataStateAt, err := p.extractDataStateAt(ctx, containerID, dataDir, pgVersion, defaultDSA)
773792
if err != nil {
774793
if defaultDSA == "" {
@@ -895,8 +914,10 @@ and the source doesn't have enough activity.
895914
Step 3. Use the timestamp of the latest checkpoint. This is extracted from PGDATA using the
896915
pg_controldata utility. Note that this is not an exact value of the latest activity in the source
897916
before we took a copy of PGDATA, but we suppose it is not far from it. */
898-
func (p *PhysicalInitial) extractDataStateAt(ctx context.Context, containerID, dataDir string, pgVersion float64,
899-
defaultDSA string) (string, error) {
917+
func (p *PhysicalInitial) extractDataStateAt(
918+
ctx context.Context, containerID, dataDir string, pgVersion float64,
919+
defaultDSA string,
920+
) (string, error) {
900921
output, err := p.getLastXActReplayTimestamp(ctx, containerID)
901922
if err != nil {
902923
log.Dbg("unable to get last replay timestamp from the promotion container: ", err)
@@ -1002,20 +1023,6 @@ func (p *PhysicalInitial) runPromoteCommand(ctx context.Context, containerID, cl
10021023
return nil
10031024
}
10041025

1005-
func (p *PhysicalInitial) checkpoint(ctx context.Context, containerID string) error {
1006-
commandCheckpoint := []string{"psql", "-U", p.globalCfg.Database.User(), "-d", p.globalCfg.Database.Name(), "-XAtc", "checkpoint"}
1007-
log.Msg("Run checkpoint command", commandCheckpoint)
1008-
1009-
output, err := tools.ExecCommandWithOutput(ctx, p.dockerClient, containerID, types.ExecConfig{Cmd: commandCheckpoint})
1010-
if err != nil {
1011-
return errors.Wrap(err, "failed to make checkpoint")
1012-
}
1013-
1014-
log.Msg("Checkpoint result: ", output)
1015-
1016-
return nil
1017-
}
1018-
10191026
func (p *PhysicalInitial) markDatabaseData() error {
10201027
if err := p.dbMarker.CreateConfig(); err != nil {
10211028
return errors.Wrap(err, "failed to create a DBMarker config of the database")

Diff for: engine/internal/retrieval/engine/postgres/tools/cont/container.go

+9-2
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,13 @@ import (
1414
"github.com/docker/docker/api/types/container"
1515
"github.com/docker/docker/api/types/filters"
1616
"github.com/docker/docker/client"
17-
units "github.com/docker/go-units"
17+
"github.com/docker/go-units"
1818
"github.com/pkg/errors"
1919

2020
"gitlab.com/postgres-ai/database-lab/v3/internal/provision/pool"
2121
"gitlab.com/postgres-ai/database-lab/v3/internal/retrieval/engine/postgres/tools"
2222
"gitlab.com/postgres-ai/database-lab/v3/internal/retrieval/options"
23+
"gitlab.com/postgres-ai/database-lab/v3/pkg/config/global"
2324
"gitlab.com/postgres-ai/database-lab/v3/pkg/log"
2425
)
2526

@@ -64,7 +65,8 @@ const (
6465
// TODO(akartasov): Control container manager.
6566

6667
// StopControlContainers stops control containers run by Database Lab Engine.
67-
func StopControlContainers(ctx context.Context, dockerClient *client.Client, instanceID string, fsm pool.FSManager) error {
68+
func StopControlContainers(ctx context.Context, dockerClient *client.Client, dbCfg *global.Database, instanceID string,
69+
fsm pool.FSManager) error {
6870
log.Msg("Stop control containers")
6971

7072
list, err := getContainerList(ctx, dockerClient, instanceID, getControlContainerFilters())
@@ -84,6 +86,11 @@ func StopControlContainers(ctx context.Context, dockerClient *client.Client, ins
8486
if shouldStopInternalProcess(controlLabel) && fsm != nil {
8587
log.Msg("Stopping control container: ", containerName)
8688

89+
if err := tools.RunCheckpoint(ctx, dockerClient, controlCont.ID, dbCfg.User(), dbCfg.Name()); err != nil {
90+
log.Msg("Failed to make a checkpoint:", err)
91+
tools.PrintContainerLogs(ctx, dockerClient, controlCont.ID)
92+
}
93+
8794
if err := tools.StopPostgres(ctx, dockerClient, controlCont.ID, fsm.Pool().DataDir(), tools.DefaultStopTimeout); err != nil {
8895
log.Msg("Failed to stop Postgres", err)
8996
tools.PrintContainerLogs(ctx, dockerClient, controlCont.ID)

Diff for: engine/internal/retrieval/engine/postgres/tools/tools.go

+34
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,40 @@ func StartPostgres(ctx context.Context, dockerClient *client.Client, containerID
258258
return nil
259259
}
260260

261+
// RunCheckpoint executes the SQL "checkpoint" command through psql inside the
// given container, connecting as user to database. It is usually called right
// before Postgres is stopped.
//
// The command and its output are written to the log. A non-nil error from
// ExecCommandWithOutput is wrapped and returned; otherwise nil is returned.
262+
func RunCheckpoint(
263+
ctx context.Context,
264+
dockerClient *client.Client,
265+
containerID string,
266+
user string,
267+
database string,
268+
) error {
269+
commandCheckpoint := []string{
270+
"psql",
271+
"-U",
272+
user,
273+
"-d",
274+
database,
275+
// -X skips psqlrc, -A and -t give unaligned tuples-only output, -c runs the command that follows.
"-XAtc",
276+
"checkpoint",
277+
}
278+
log.Msg("Run checkpoint command", commandCheckpoint)
279+
280+
output, err := ExecCommandWithOutput(
281+
ctx,
282+
dockerClient,
283+
containerID,
284+
types.ExecConfig{Cmd: commandCheckpoint},
285+
)
286+
if err != nil {
287+
return errors.Wrap(err, "failed to make checkpoint")
288+
}
289+
290+
log.Msg("Checkpoint result: ", output)
291+
292+
return nil
293+
}
294+
261295
// StopPostgres stops Postgres inside container.
262296
func StopPostgres(ctx context.Context, dockerClient *client.Client, containerID, dataDir string, timeout int) error {
263297
pgVersion, err := DetectPGVersion(dataDir)

0 commit comments

Comments
 (0)