Skip to content

Commit 9f02811

Browse files
committed
wal: validate recovery directory through a stable identifier
We now persist a stable identifier to both the OPTIONS file and a file within the secondary directory (failover_identifier). If recovery finds that the secondary directory does not contain a matching identifier, we abort recovery indicating that the secondary seems incorrect / corrupt. Fixes: #4416
1 parent 392f102 commit 9f02811

File tree

9 files changed

+435
-3
lines changed

9 files changed

+435
-3
lines changed

open.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,7 @@ func Open(dirname string, opts *Options) (db *DB, err error) {
350350
}
351351
if opts.WALFailover != nil {
352352
walOpts.Secondary = opts.WALFailover.Secondary
353+
walOpts.Secondary.ID = opts.WALFailover.Secondary.ID
353354
// Lock the secondary WAL directory, if distinct from the data directory
354355
// and primary WAL directory.
355356
if secondaryWalDirName != dirname && secondaryWalDirName != walDirname {
@@ -424,10 +425,19 @@ func Open(dirname string, opts *Options) (db *DB, err error) {
424425
}
425426
}
426427

428+
if opts.WALFailover != nil {
429+
walDir, err := wal.ValidateOrInitWALDir(walOpts.Secondary)
430+
if err != nil {
431+
return nil, err
432+
}
433+
walOpts.Secondary = walDir
434+
opts.WALFailover.Secondary.ID = walDir.ID
435+
}
427436
walManager, err := wal.Init(walOpts, retainedWALs)
428437
if err != nil {
429438
return nil, err
430439
}
440+
431441
defer maybeCleanUp(walManager.Close)
432442
d.mu.log.manager = walManager
433443

open_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,9 @@ func TestOpen_WALFailover(t *testing.T) {
256256
if o.FS == nil {
257257
return "no path"
258258
}
259+
// Set a constant identifier for testing to avoid flaky tests
260+
wal.SetGenerateStableIdentifierForTesting("9f69f2c3ffb3c247767290a9b3215fc5")
261+
defer wal.ResetGenerateStableIdentifierForTesting()
259262
d, err := Open(dataDir, o)
260263
if err != nil {
261264
return err.Error()

options.go

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919

2020
"github.com/cockroachdb/crlib/fifo"
2121
"github.com/cockroachdb/errors"
22+
"github.com/cockroachdb/errors/oserror"
2223
"github.com/cockroachdb/pebble/internal/base"
2324
"github.com/cockroachdb/pebble/internal/cache"
2425
"github.com/cockroachdb/pebble/internal/humanize"
@@ -1863,6 +1864,9 @@ func (o *Options) String() string {
18631864
fmt.Fprintf(&buf, "\n")
18641865
fmt.Fprintf(&buf, "[WAL Failover]\n")
18651866
fmt.Fprintf(&buf, " secondary_dir=%s\n", o.WALFailover.Secondary.Dirname)
1867+
if o.WALFailover.Secondary.ID != "" {
1868+
fmt.Fprintf(&buf, " secondary_identifier=%s\n", o.WALFailover.Secondary.ID)
1869+
}
18661870
fmt.Fprintf(&buf, " primary_dir_probe_interval=%s\n", o.WALFailover.FailoverOptions.PrimaryDirProbeInterval)
18671871
fmt.Fprintf(&buf, " healthy_probe_latency_threshold=%s\n", o.WALFailover.FailoverOptions.HealthyProbeLatencyThreshold)
18681872
fmt.Fprintf(&buf, " healthy_interval=%s\n", o.WALFailover.FailoverOptions.HealthyInterval)
@@ -2317,6 +2321,8 @@ func (o *Options) Parse(s string, hooks *ParseHooks) error {
23172321
switch key {
23182322
case "secondary_dir":
23192323
o.WALFailover.Secondary = wal.Dir{Dirname: value, FS: vfs.Default}
2324+
case "secondary_identifier":
2325+
o.WALFailover.Secondary.ID = value
23202326
case "primary_dir_probe_interval":
23212327
o.WALFailover.PrimaryDirProbeInterval, err = time.ParseDuration(value)
23222328
case "healthy_probe_latency_threshold":
@@ -2435,6 +2441,21 @@ func (e ErrMissingWALRecoveryDir) Error() string {
24352441
return fmt.Sprintf("directory %q may contain relevant WALs but is not in WALRecoveryDirs%s", e.Dir, e.ExtraInfo)
24362442
}
24372443

2444+
// ErrSecondaryIdentifierMismatch is an error returned when the secondary directory
2445+
// identifier doesn't match the expected identifier, indicating the wrong disk
2446+
// may have been mounted at the expected path.
2447+
type ErrSecondaryIdentifierMismatch struct {
2448+
ExpectedIdentifier string
2449+
ActualIdentifier string
2450+
SecondaryDir string
2451+
}
2452+
2453+
// Error implements error.
2454+
func (e ErrSecondaryIdentifierMismatch) Error() string {
2455+
return fmt.Sprintf("secondary directory %q has identifier %q but expected %q - wrong disk may be mounted",
2456+
e.SecondaryDir, e.ActualIdentifier, e.ExpectedIdentifier)
2457+
}
2458+
24382459
// CheckCompatibility verifies the options are compatible with the previous options
24392460
// serialized by Options.String(). For example, the Comparer and Merger must be
24402461
// the same, or data will not be able to be properly read from the DB.
@@ -2502,6 +2523,12 @@ func (o *Options) checkWALDir(storeDir, walDir, errContext string) error {
25022523
for _, d := range o.WALRecoveryDirs {
25032524
// TODO(radu): should we also check that d.FS is the same as walDir's FS?
25042525
if walPath == resolveStorePath(storeDir, d.Dirname) {
2526+
if d.ID != "" {
2527+
if err := o.validateWALRecoveryDirIdentifier(d); err != nil {
2528+
return err
2529+
}
2530+
2531+
}
25052532
return nil
25062533
}
25072534
}
@@ -2789,3 +2816,33 @@ func resolveStorePath(storeDir, path string) string {
27892816
}
27902817
return path
27912818
}
2819+
2820+
// validateWALRecoveryDirIdentifier validates that the identifier in the
2821+
// provided wal.Dir matches the expected ID encoded in the OPTIONS file to
2822+
// ensure that we're using the correct directory.
2823+
func (o *Options) validateWALRecoveryDirIdentifier(d wal.Dir) error {
2824+
identifierFile := d.FS.PathJoin(d.Dirname, "failover_identifier")
2825+
f, err := d.FS.Open(identifierFile)
2826+
if err != nil {
2827+
if oserror.IsNotExist(err) {
2828+
return nil
2829+
}
2830+
return err
2831+
}
2832+
defer f.Close()
2833+
2834+
existingIdentifier, err := io.ReadAll(f)
2835+
if err != nil {
2836+
return err
2837+
}
2838+
2839+
if err != nil {
2840+
return errors.Newf("failed to read secondary identifier from WALRecoveryDir %q: %v",
2841+
d.Dirname, err)
2842+
}
2843+
if strings.TrimSpace(string(existingIdentifier)) != d.ID {
2844+
return errors.Newf("WALRecoveryDir %q has identifier %q but expected %q",
2845+
d.Dirname, existingIdentifier, d.ID)
2846+
}
2847+
return nil
2848+
}

options_test.go

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ package pebble
77
import (
88
"bytes"
99
"fmt"
10+
"io"
1011
"math/rand/v2"
1112
"runtime"
1213
"strings"
@@ -259,7 +260,7 @@ func TestOptionsCheckCompatibility(t *testing.T) {
259260

260261
// Check that an OPTIONS file that configured an explicit WALDir that will
261262
// no longer be used errors if it's not also present in WALRecoveryDirs.
262-
//require.Equal(t, ErrMissingWALRecoveryDir{Dir: "external-wal-dir"},
263+
// require.Equal(t, ErrMissingWALRecoveryDir{Dir: "external-wal-dir"},
263264
err := DefaultOptions().CheckCompatibility(storeDir, `
264265
[Options]
265266
wal_dir=external-wal-dir
@@ -339,6 +340,52 @@ func TestOptionsCheckCompatibility(t *testing.T) {
339340
`))
340341
}
341342

343+
func TestWALRecoveryDirValidation(t *testing.T) {
344+
storeDir := "/mnt/foo"
345+
mem := vfs.NewMem()
346+
recoveryDir := "/mnt/wrong-disk-dir"
347+
err := mem.MkdirAll(recoveryDir, 0755)
348+
require.NoError(t, err)
349+
350+
// Create failover_identifier file with different ID.
351+
identifierFile := mem.PathJoin(recoveryDir, "failover_identifier")
352+
wrongID := "11111111111111111111111111111111"
353+
err = writeTestIdentifierToFile(mem, identifierFile, wrongID)
354+
require.NoError(t, err)
355+
356+
opts := &Options{
357+
FS: mem,
358+
WALRecoveryDirs: []wal.Dir{
359+
{
360+
FS: mem,
361+
Dirname: recoveryDir,
362+
ID: "22222222222222222222222222222222",
363+
},
364+
},
365+
}
366+
opts.EnsureDefaults()
367+
368+
err = opts.checkWALDir(storeDir, recoveryDir, "test context")
369+
require.Error(t, err)
370+
require.Contains(t, err.Error(), "has identifier \"11111111111111111111111111111111\" but expected \"22222222222222222222222222222222\"")
371+
}
372+
373+
// writeTestIdentifierToFile is a helper function to write an identifier to a file
374+
func writeTestIdentifierToFile(fs vfs.FS, filename, identifier string) error {
375+
f, err := fs.Create(filename, "pebble-wal")
376+
if err != nil {
377+
return err
378+
}
379+
defer f.Close()
380+
381+
_, err = io.WriteString(f, identifier)
382+
if err != nil {
383+
return err
384+
}
385+
386+
return f.Sync()
387+
}
388+
342389
type testCleaner struct{}
343390

344391
func (testCleaner) Clean(fs vfs.FS, fileType base.FileType, path string) error {

testdata/open_wal_failover

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ list path=(a,data)
4646
grep-between path=(a,data/OPTIONS-000007) start=(\[WAL Failover\]) end=^$
4747
----
4848
secondary_dir=secondary-wals
49+
secondary_identifier=9f69f2c3ffb3c247767290a9b3215fc5
4950
primary_dir_probe_interval=1s
5051
healthy_probe_latency_threshold=25ms
5152
healthy_interval=15s

wal/failover_manager.go

Lines changed: 115 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,18 @@ package wal
66

77
import (
88
"cmp"
9+
crand "crypto/rand"
910
"fmt"
1011
"io"
11-
"math/rand/v2"
12+
mathrand "math/rand/v2"
1213
"os"
1314
"slices"
15+
"strings"
1416
"sync"
1517
"time"
1618

1719
"github.com/cockroachdb/errors"
20+
"github.com/cockroachdb/errors/oserror"
1821
"github.com/cockroachdb/pebble/internal/base"
1922
"github.com/cockroachdb/pebble/internal/invariants"
2023
"github.com/cockroachdb/pebble/vfs"
@@ -52,6 +55,71 @@ const probeHistoryLength = 128
5255
// Large value.
5356
const failedProbeDuration = 24 * 60 * 60 * time.Second
5457

58+
// For testing, generateStableIdentifierForTesting can be overridden to return
59+
// a constant value when we generate stable identifiers.
60+
var generateStableIdentifierForTesting = ""
61+
62+
// SetGenerateStableIdentifierForTesting sets a constant identifier for testing.
63+
// This should only be used in tests to avoid flaky behavior.
64+
func SetGenerateStableIdentifierForTesting(identifier string) {
65+
generateStableIdentifierForTesting = identifier
66+
}
67+
68+
// ResetGenerateStableIdentifierForTesting resets the testing override.
69+
// This should only be used in tests.
70+
func ResetGenerateStableIdentifierForTesting() {
71+
generateStableIdentifierForTesting = ""
72+
}
73+
74+
// generateStableIdentifier generates a random hex string from 16 bytes.
75+
func generateStableIdentifier() (string, error) {
76+
// For testing, return a constant value if set.
77+
if generateStableIdentifierForTesting != "" {
78+
return generateStableIdentifierForTesting, nil
79+
}
80+
81+
var uuid [16]byte
82+
if _, err := crand.Read(uuid[:]); err != nil {
83+
return "", err
84+
}
85+
return fmt.Sprintf("%x", uuid), nil
86+
}
87+
88+
// readSecondaryIdentifier reads the identifier from the secondary directory.
89+
func readSecondaryIdentifier(fs vfs.FS, identifierFile string) (string, error) {
90+
f, err := fs.Open(identifierFile)
91+
if err != nil {
92+
if oserror.IsNotExist(err) {
93+
return "", nil
94+
}
95+
return "", err
96+
}
97+
defer f.Close()
98+
99+
data, err := io.ReadAll(f)
100+
if err != nil {
101+
return "", err
102+
}
103+
104+
// Trim whitespace and return the identifier.
105+
return strings.TrimSpace(string(data)), nil
106+
}
107+
108+
// writeSecondaryIdentifier writes the identifier to the secondary directory.
109+
func writeSecondaryIdentifier(fs vfs.FS, identifierFile string, identifier string) error {
110+
f, err := fs.Create(identifierFile, "pebble-wal")
111+
if err != nil {
112+
return err
113+
}
114+
115+
if _, err := io.WriteString(f, identifier); err != nil {
116+
f.Close()
117+
return err
118+
}
119+
120+
return errors.CombineErrors(f.Sync(), f.Close())
121+
}
122+
55123
// init takes a stopper in order to connect the dirProber's long-running
56124
// goroutines with the stopper's wait group, but the dirProber has its own
57125
// stop() method that should be invoked to trigger the shutdown.
@@ -73,7 +141,7 @@ func (p *dirProber) init(
73141
}
74142
// Random bytes for writing, to defeat any FS compression optimization.
75143
for i := range p.buf {
76-
p.buf[i] = byte(rand.Uint32())
144+
p.buf[i] = byte(mathrand.Uint32())
77145
}
78146
// dirProber has an explicit stop() method instead of listening on
79147
// stopper.shouldQuiesce. This structure helps negotiate the shutdown
@@ -538,6 +606,46 @@ func (wm *failoverManager) init(o Options, initial Logs) error {
538606
return nil
539607
}
540608

609+
// ValidateOrInitWALDir manages the secondary directory identifier for
610+
// failover validation. It ensures the correct secondary directory is mounted
611+
// by validating or generating a stable identifier.
612+
func ValidateOrInitWALDir(walDir Dir) (Dir, error) {
613+
identifierFile := walDir.FS.PathJoin(walDir.Dirname, "failover_identifier")
614+
// If we have an identifier from the OPTIONS file, validate it matches what's
615+
// in the directory.
616+
if walDir.ID != "" {
617+
existingIdentifier, err := readSecondaryIdentifier(walDir.FS, identifierFile)
618+
if err != nil {
619+
return Dir{}, errors.Newf("failed to read secondary identifier: %v", err)
620+
}
621+
// Not the same identifier, wrong disk may be mounted.
622+
if existingIdentifier != walDir.ID {
623+
return Dir{}, errors.Newf("secondary directory %q has identifier %q but expected %q - wrong disk may be mounted",
624+
walDir.Dirname, existingIdentifier, walDir.ID)
625+
}
626+
} else {
627+
// No identifier in OPTIONS file, check if one exists in the directory.
628+
existingIdentifier, err := readSecondaryIdentifier(walDir.FS, identifierFile)
629+
if err != nil {
630+
return Dir{}, errors.Newf("failed to read secondary identifier: %v", err)
631+
}
632+
if existingIdentifier == "" {
633+
// Generate a new identifier.
634+
identifier, err := generateStableIdentifier()
635+
if err != nil {
636+
return Dir{}, errors.Newf("failed to generate UUID: %v", err)
637+
}
638+
if err := writeSecondaryIdentifier(walDir.FS, identifierFile, identifier); err != nil {
639+
return Dir{}, errors.Newf("failed to write secondary identifier: %v", err)
640+
}
641+
walDir.ID = identifier
642+
} else {
643+
walDir.ID = existingIdentifier
644+
}
645+
}
646+
return walDir, nil
647+
}
648+
541649
// List implements Manager.
542650
func (wm *failoverManager) List() Logs {
543651
wm.mu.Lock()
@@ -843,6 +951,11 @@ func (wm *failoverManager) logCreator(
843951
return logFile, 0, err
844952
}
845953

954+
// Opts implements Manager.
955+
func (wm *failoverManager) Opts() Options {
956+
return wm.opts
957+
}
958+
846959
type stopper struct {
847960
quiescer chan struct{} // Closed when quiescing
848961
wg sync.WaitGroup

wal/standalone_manager.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,11 @@ func (m *StandaloneManager) Close() error {
257257
return err
258258
}
259259

260+
// Opts implements Manager.
261+
func (m *StandaloneManager) Opts() Options {
262+
return m.o
263+
}
264+
260265
// RecyclerForTesting implements Manager.
261266
func (m *StandaloneManager) RecyclerForTesting() *LogRecycler {
262267
return &m.recycler

0 commit comments

Comments
 (0)