@@ -134,6 +134,12 @@ type QueryPreprocessing struct {
134
134
MaxParallelWorkers int `yaml:"maxParallelWorkers"`
135
135
}
136
136
137
// syncState captures the outcome of checking the sync instance before a snapshot is taken.
type syncState struct {
	// DSA is the data-state-at value obtained from the sync instance check.
	// It is empty when the check failed or was not performed.
	DSA string
	// Err is the error returned by the sync instance check; nil means the check succeeded.
	Err error
}
142
+
137
143
// NewPhysicalInitialJob creates a new physical initial job.
138
144
func NewPhysicalInitialJob (cfg config.JobConfig , global * dblabCfg.Global , cloneManager pool.FSManager ) (* PhysicalInitial , error ) {
139
145
p := & PhysicalInitial {
@@ -298,11 +304,14 @@ func (p *PhysicalInitial) run(ctx context.Context) (err error) {
298
304
}
299
305
}()
300
306
301
- var syncErr error
307
+ var syState syncState
302
308
303
309
if p .options .Promotion .Enabled {
304
- if syncErr = p .checkSyncInstance (ctx ); syncErr != nil {
305
- log .Dbg (fmt .Sprintf ("failed to check the sync instance before snapshotting: %v" , syncErr ), "Changing the promotion strategy" )
310
+ syState .DSA , syState .Err = p .checkSyncInstance (ctx )
311
+
312
+ if syState .Err != nil {
313
+ log .Dbg (fmt .Sprintf ("failed to check the sync instance before snapshotting: %v" , syState ),
314
+ "Recovery configs will be applied on the promotion stage" )
306
315
}
307
316
}
308
317
@@ -334,7 +343,7 @@ func (p *PhysicalInitial) run(ctx context.Context) (err error) {
334
343
335
344
// Promotion.
336
345
if p .options .Promotion .Enabled {
337
- if err := p .promoteInstance (ctx , path .Join (p .fsPool .ClonesDir (), cloneName , p .fsPool .DataSubDir ), syncErr ); err != nil {
346
+ if err := p .promoteInstance (ctx , path .Join (p .fsPool .ClonesDir (), cloneName , p .fsPool .DataSubDir ), syState ); err != nil {
338
347
return errors .Wrap (err , "failed to promote instance" )
339
348
}
340
349
}
@@ -361,23 +370,32 @@ func (p *PhysicalInitial) run(ctx context.Context) (err error) {
361
370
return nil
362
371
}
363
372
364
- func (p * PhysicalInitial ) checkSyncInstance (ctx context.Context ) error {
373
+ func (p * PhysicalInitial ) checkSyncInstance (ctx context.Context ) (string , error ) {
374
+ log .Msg ("Check the sync instance state: " , p .syncInstanceName ())
375
+
365
376
syncContainer , err := p .dockerClient .ContainerInspect (ctx , p .syncInstanceName ())
366
377
if err != nil {
367
- return err
378
+ return "" , err
368
379
}
369
380
370
381
if err := tools .CheckContainerReadiness (ctx , p .dockerClient , syncContainer .ID ); err != nil {
371
- return errors .Wrap (err , "failed to readiness check" )
382
+ return "" , errors .Wrap (err , "failed to readiness check" )
372
383
}
373
384
374
385
log .Msg ("Sync instance has been checked. It is running" )
375
386
376
387
if err := p .checkpoint (ctx , syncContainer .ID ); err != nil {
377
- return errors .Wrap (err , "failed to make a checkpoint for sync instance" )
388
+ return "" , errors .Wrap (err , "failed to make a checkpoint for sync instance" )
378
389
}
379
390
380
- return nil
391
+ extractedDataStateAt , err := p .getLastXActReplayTimestamp (ctx , syncContainer .ID )
392
+ if err != nil {
393
+ return "" , errors .Wrap (err , `failed to get last xact replay timestamp from the sync instance` )
394
+ }
395
+
396
+ log .Msg ("Sync instance data state at: " , extractedDataStateAt )
397
+
398
+ return extractedDataStateAt , nil
381
399
}
382
400
383
401
func (p * PhysicalInitial ) syncInstanceName () string {
@@ -440,7 +458,7 @@ func (p *PhysicalInitial) promoteContainerName() string {
440
458
return promoteContainerPrefix + p .globalCfg .InstanceID
441
459
}
442
460
443
- func (p * PhysicalInitial ) promoteInstance (ctx context.Context , clonePath string , syncErr error ) (err error ) {
461
+ func (p * PhysicalInitial ) promoteInstance (ctx context.Context , clonePath string , syState syncState ) (err error ) {
444
462
p .promotionMutex .Lock ()
445
463
defer p .promotionMutex .Unlock ()
446
464
@@ -470,7 +488,7 @@ func (p *PhysicalInitial) promoteInstance(ctx context.Context, clonePath string,
470
488
recoveryConfig := make (map [string ]string )
471
489
472
490
// Item 5. Remove a recovery file: https://gitlab.com/postgres-ai/database-lab/-/issues/236#note_513401256
473
- if syncErr != nil {
491
+ if syState . Err != nil {
474
492
recoveryConfig = buildRecoveryConfig (recoveryFileConfig , p .options .Promotion .Recovery )
475
493
476
494
if err := cfgManager .ApplyRecovery (recoveryFileConfig ); err != nil {
@@ -563,7 +581,7 @@ func (p *PhysicalInitial) promoteInstance(ctx context.Context, clonePath string,
563
581
}
564
582
}
565
583
566
- if err := p .markDSA (ctx , promoteCont .ID , clonePath , cfgManager .GetPgVersion ()); err != nil {
584
+ if err := p .markDSA (ctx , syState . DSA , promoteCont .ID , clonePath , cfgManager .GetPgVersion ()); err != nil {
567
585
return errors .Wrap (err , "failed to mark dataStateAt" )
568
586
}
569
587
@@ -621,13 +639,18 @@ func buildRecoveryConfig(fileConfig, userRecoveryConfig map[string]string) map[s
621
639
return recoveryConf
622
640
}
623
641
624
- func (p * PhysicalInitial ) markDSA (ctx context.Context , containerID , dataDir string , pgVersion float64 ) error {
642
+ func (p * PhysicalInitial ) markDSA (ctx context.Context , defaultDSA , containerID , dataDir string , pgVersion float64 ) error {
625
643
extractedDataStateAt , err := p .extractDataStateAt (ctx , containerID , dataDir , pgVersion )
626
644
if err != nil {
627
- return errors .Wrap (err , `failed to extract dataStateAt` )
645
+ if defaultDSA == "" {
646
+ return errors .Wrap (err , `failed to extract dataStateAt` )
647
+ }
648
+
649
+ log .Msg ("failed to extract dataStateAt. Use value from the sync instance: " , defaultDSA )
650
+ extractedDataStateAt = defaultDSA
628
651
}
629
652
630
- log .Msg ("Extracted Data state at: " , extractedDataStateAt )
653
+ log .Msg ("Data state at: " , extractedDataStateAt )
631
654
632
655
if p .dbMark .DataStateAt != "" && extractedDataStateAt == p .dbMark .DataStateAt {
633
656
return newSkipSnapshotErr (fmt .Sprintf (
@@ -637,7 +660,7 @@ func (p *PhysicalInitial) markDSA(ctx context.Context, containerID, dataDir stri
637
660
638
661
p .dbMark .DataStateAt = extractedDataStateAt
639
662
640
- log .Msg ("Data state at: " , p .dbMark .DataStateAt )
663
+ log .Msg ("Mark data state at: " , p .dbMark .DataStateAt )
641
664
642
665
return nil
643
666
}
@@ -726,15 +749,7 @@ func (p *PhysicalInitial) checkRecovery(ctx context.Context, containerID string)
726
749
}
727
750
728
751
func (p * PhysicalInitial ) extractDataStateAt (ctx context.Context , containerID , dataDir string , pgVersion float64 ) (string , error ) {
729
- extractionCommand := []string {"psql" , "-U" , p .globalCfg .Database .User (), "-d" , p .globalCfg .Database .Name (), "-XAtc" ,
730
- "select to_char(pg_last_xact_replay_timestamp() at time zone 'UTC', 'YYYYMMDDHH24MISS')" }
731
-
732
- log .Msg ("Running dataStateAt command" , extractionCommand )
733
-
734
- output , err := tools .ExecCommandWithOutput (ctx , p .dockerClient , containerID , types.ExecConfig {
735
- Cmd : extractionCommand ,
736
- User : defaults .Username ,
737
- })
752
+ output , err := p .getLastXActReplayTimestamp (ctx , containerID )
738
753
739
754
if output == "" {
740
755
log .Msg ("The last replay timestamp not found. Extract the last checkpoint timestamp" )
@@ -755,6 +770,22 @@ func (p *PhysicalInitial) extractDataStateAt(ctx context.Context, containerID, d
755
770
return output , err
756
771
}
757
772
773
+ func (p * PhysicalInitial ) getLastXActReplayTimestamp (ctx context.Context , containerID string ) (string , error ) {
774
+ extractionCommand := []string {"psql" , "-U" , p .globalCfg .Database .User (), "-d" , p .globalCfg .Database .Name (), "-XAtc" ,
775
+ "select to_char(pg_last_xact_replay_timestamp() at time zone 'UTC', 'YYYYMMDDHH24MISS')" }
776
+
777
+ log .Msg ("Running dataStateAt command" , extractionCommand )
778
+
779
+ output , err := tools .ExecCommandWithOutput (ctx , p .dockerClient , containerID , types.ExecConfig {
780
+ Cmd : extractionCommand ,
781
+ User : defaults .Username ,
782
+ })
783
+
784
+ log .Msg ("Extracted last replay timestamp: " , output )
785
+
786
+ return output , err
787
+ }
788
+
758
789
func getCheckPointTimestamp (ctx context.Context , r io.Reader ) (string , error ) {
759
790
scanner := bufio .NewScanner (r )
760
791
checkpointTitleBytes := []byte (checkpointTimestampLabel )
0 commit comments