Skip to content

Commit 7548068

Browse files
authored
Preserve ingester state on restart (#6301)
* Expand token file to record previous state before shutting down and join ring with previous state Signed-off-by: Alex Le <[email protected]> * Added logs and unit tests Signed-off-by: Alex Le <[email protected]> * fix test and add instance state serialization code Signed-off-by: Alex Le <[email protected]> * update comments Signed-off-by: Alex Le <[email protected]> * addressed comments and added compatibility tests Signed-off-by: Alex Le <[email protected]> --------- Signed-off-by: Alex Le <[email protected]>
1 parent 661f47b commit 7548068

10 files changed

+279
-137
lines changed

pkg/compactor/compactor_test.go

+1
Original file line numberDiff line numberDiff line change
@@ -1476,6 +1476,7 @@ func removeIgnoredLogs(input []string) []string {
14761476

14771477
ignoredLogStringsRegexList := []*regexp.Regexp{
14781478
regexp.MustCompile(`^level=(info|debug|warn) component=cleaner .+$`),
1479+
regexp.MustCompile(`^level=info component=compactor msg="set state" .+$`),
14791480
}
14801481

14811482
out := make([]string, 0, len(input))

pkg/ring/basic_lifecycler_delegates.go

+4-2
Original file line numberDiff line numberDiff line change
@@ -70,14 +70,15 @@ func (d *TokensPersistencyDelegate) OnRingInstanceRegister(lifecycler *BasicLife
7070
return d.next.OnRingInstanceRegister(lifecycler, ringDesc, instanceExists, instanceID, instanceDesc)
7171
}
7272

73-
tokensFromFile, err := LoadTokensFromFile(d.tokensPath)
73+
tokenFile, err := LoadTokenFile(d.tokensPath)
7474
if err != nil {
7575
if !os.IsNotExist(err) {
7676
level.Error(d.logger).Log("msg", "error loading tokens from file", "err", err)
7777
}
7878

7979
return d.next.OnRingInstanceRegister(lifecycler, ringDesc, instanceExists, instanceID, instanceDesc)
8080
}
81+
tokensFromFile := tokenFile.Tokens
8182

8283
// Signal the next delegate that the tokens have been loaded, miming the
8384
// case the instance exist in the ring (which is OK because the lifecycler
@@ -94,7 +95,8 @@ func (d *TokensPersistencyDelegate) OnRingInstanceRegister(lifecycler *BasicLife
9495

9596
func (d *TokensPersistencyDelegate) OnRingInstanceTokens(lifecycler *BasicLifecycler, tokens Tokens) {
9697
if d.tokensPath != "" {
97-
if err := tokens.StoreToFile(d.tokensPath); err != nil {
98+
tokenFile := TokenFile{Tokens: tokens}
99+
if err := tokenFile.StoreToFile(d.tokensPath); err != nil {
98100
level.Error(d.logger).Log("msg", "error storing tokens to disk", "path", d.tokensPath, "err", err)
99101
}
100102
}

pkg/ring/basic_lifecycler_delegates_test.go

+11-9
Original file line numberDiff line numberDiff line change
@@ -69,22 +69,23 @@ func TestTokensPersistencyDelegate_ShouldSkipTokensLoadingIfFileDoesNotExist(t *
6969
require.NoError(t, services.StopAndAwaitTerminated(ctx, lifecycler))
7070

7171
// Ensure tokens have been stored.
72-
actualTokens, err := LoadTokensFromFile(tokensFile.Name())
72+
tokenFile, err := LoadTokenFile(tokensFile.Name())
7373
require.NoError(t, err)
74-
assert.Equal(t, Tokens{1, 2, 3, 4, 5}, actualTokens)
74+
assert.Equal(t, Tokens{1, 2, 3, 4, 5}, tokenFile.Tokens)
7575

7676
// Ensure no error has been logged.
7777
assert.Empty(t, logs.String())
7878
}
7979

80-
func TestTokensPersistencyDelegate_ShouldLoadTokensFromFileIfFileExist(t *testing.T) {
80+
func TestTokensPersistencyDelegate_ShouldLoadTokenFileIfFileExist(t *testing.T) {
8181
tokensFile, err := os.CreateTemp("", "tokens-*")
8282
require.NoError(t, err)
8383
defer os.Remove(tokensFile.Name()) //nolint:errcheck
8484

8585
// Store some tokens to the file.
8686
storedTokens := Tokens{6, 7, 8, 9, 10}
87-
require.NoError(t, storedTokens.StoreToFile(tokensFile.Name()))
87+
tokenFile1 := TokenFile{Tokens: storedTokens}
88+
require.NoError(t, tokenFile1.StoreToFile(tokensFile.Name()))
8889

8990
testDelegate := &mockDelegate{
9091
onRegister: func(lifecycler *BasicLifecycler, ringDesc Desc, instanceExists bool, instanceID string, instanceDesc InstanceDesc) (InstanceState, Tokens) {
@@ -113,9 +114,9 @@ func TestTokensPersistencyDelegate_ShouldLoadTokensFromFileIfFileExist(t *testin
113114
require.NoError(t, services.StopAndAwaitTerminated(ctx, lifecycler))
114115

115116
// Ensure we can still read back the tokens file.
116-
actualTokens, err := LoadTokensFromFile(tokensFile.Name())
117+
tokenFile, err := LoadTokenFile(tokensFile.Name())
117118
require.NoError(t, err)
118-
assert.Equal(t, storedTokens, actualTokens)
119+
assert.Equal(t, storedTokens, tokenFile.Tokens)
119120
}
120121

121122
func TestTokensPersistencyDelegate_ShouldHandleTheCaseTheInstanceIsAlreadyInTheRing(t *testing.T) {
@@ -150,7 +151,8 @@ func TestTokensPersistencyDelegate_ShouldHandleTheCaseTheInstanceIsAlreadyInTheR
150151
defer os.Remove(tokensFile.Name()) //nolint:errcheck
151152

152153
// Store some tokens to the file.
153-
require.NoError(t, storedTokens.StoreToFile(tokensFile.Name()))
154+
tokenFile1 := TokenFile{Tokens: storedTokens}
155+
require.NoError(t, tokenFile1.StoreToFile(tokensFile.Name()))
154156

155157
// We assume the instance is already registered to the ring.
156158
registeredAt := time.Now().Add(-time.Hour)
@@ -226,9 +228,9 @@ func TestDelegatesChain(t *testing.T) {
226228
assert.True(t, onStoppingCalled)
227229

228230
// Ensure tokens have been stored.
229-
actualTokens, err := LoadTokensFromFile(tokensFile.Name())
231+
tokenFile, err := LoadTokenFile(tokensFile.Name())
230232
require.NoError(t, err)
231-
assert.Equal(t, Tokens{1, 2, 3, 4, 5}, actualTokens)
233+
assert.Equal(t, Tokens{1, 2, 3, 4, 5}, tokenFile.Tokens)
232234
}
233235

234236
func TestAutoForgetDelegate(t *testing.T) {

pkg/ring/lifecycler.go

+71-17
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ type Lifecycler struct {
130130
// goes away and comes back empty. The state changes during lifecycle of instance.
131131
stateMtx sync.RWMutex
132132
state InstanceState
133-
tokens Tokens
133+
tokenFile *TokenFile
134134
registeredAt time.Time
135135

136136
// Controls the ready-reporting
@@ -205,6 +205,7 @@ func NewLifecycler(
205205
actorChan: make(chan func()),
206206
autojoinChan: make(chan struct{}, 1),
207207
state: PENDING,
208+
tokenFile: &TokenFile{PreviousState: ACTIVE},
208209
lifecyclerMetrics: NewLifecyclerMetrics(ringName, reg),
209210
logger: logger,
210211
tg: tg,
@@ -301,6 +302,7 @@ func (i *Lifecycler) GetState() InstanceState {
301302
// setState replaces the lifecycler's in-memory instance state while holding
// stateMtx for writing. The transition (old and new state) is logged at info
// level before the new value is assigned.
func (i *Lifecycler) setState(state InstanceState) {
	i.stateMtx.Lock()
	defer i.stateMtx.Unlock()

	previous := i.state
	level.Info(i.logger).Log("msg", "set state", "old_state", previous, "new_state", state)
	i.state = state
}
306308

@@ -334,7 +336,7 @@ func (i *Lifecycler) ChangeState(ctx context.Context, state InstanceState) error
334336
func (i *Lifecycler) getTokens() Tokens {
335337
i.stateMtx.RLock()
336338
defer i.stateMtx.RUnlock()
337-
return i.tokens
339+
return i.tokenFile.Tokens
338340
}
339341

340342
func (i *Lifecycler) setTokens(tokens Tokens) {
@@ -343,14 +345,54 @@ func (i *Lifecycler) setTokens(tokens Tokens) {
343345
i.stateMtx.Lock()
344346
defer i.stateMtx.Unlock()
345347

346-
i.tokens = tokens
348+
i.tokenFile.Tokens = tokens
347349
if i.cfg.TokensFilePath != "" {
348-
if err := i.tokens.StoreToFile(i.cfg.TokensFilePath); err != nil {
350+
if err := i.tokenFile.StoreToFile(i.cfg.TokensFilePath); err != nil {
349351
level.Error(i.logger).Log("msg", "error storing tokens to disk", "path", i.cfg.TokensFilePath, "err", err)
350352
}
351353
}
352354
}
353355

356+
func (i *Lifecycler) getPreviousState() InstanceState {
357+
i.stateMtx.RLock()
358+
defer i.stateMtx.RUnlock()
359+
return i.tokenFile.PreviousState
360+
}
361+
362+
func (i *Lifecycler) setPreviousState(state InstanceState) {
363+
i.stateMtx.Lock()
364+
defer i.stateMtx.Unlock()
365+
366+
if !(state == ACTIVE || state == READONLY) {
367+
level.Error(i.logger).Log("msg", "cannot store unsupported state to disk", "new_state", state, "old_state", i.tokenFile.PreviousState)
368+
return
369+
}
370+
371+
i.tokenFile.PreviousState = state
372+
if i.cfg.TokensFilePath != "" {
373+
if err := i.tokenFile.StoreToFile(i.cfg.TokensFilePath); err != nil {
374+
level.Error(i.logger).Log("msg", "error storing state to disk", "path", i.cfg.TokensFilePath, "err", err)
375+
} else {
376+
level.Info(i.logger).Log("msg", "saved state to disk", "state", state, "path", i.cfg.TokensFilePath)
377+
}
378+
}
379+
}
380+
381+
func (i *Lifecycler) loadTokenFile() (*TokenFile, error) {
382+
383+
t, err := LoadTokenFile(i.cfg.TokensFilePath)
384+
if err != nil {
385+
return nil, err
386+
}
387+
388+
i.stateMtx.Lock()
389+
defer i.stateMtx.Unlock()
390+
391+
i.tokenFile = t
392+
level.Info(i.logger).Log("msg", "loaded token file", "state", i.tokenFile.PreviousState, "num_tokens", len(i.tokenFile.Tokens), "path", i.cfg.TokensFilePath)
393+
return i.tokenFile, nil
394+
}
395+
354396
func (i *Lifecycler) getRegisteredAt() time.Time {
355397
i.stateMtx.RLock()
356398
defer i.stateMtx.RUnlock()
@@ -501,8 +543,8 @@ func (i *Lifecycler) loop(ctx context.Context) error {
501543
level.Info(i.logger).Log("msg", "observing tokens before going ACTIVE", "ring", i.RingName)
502544
observeChan = time.After(i.cfg.ObservePeriod)
503545
} else {
504-
if err := i.autoJoin(context.Background(), ACTIVE); err != nil {
505-
return errors.Wrapf(err, "failed to pick tokens in the KV store, ring: %s", i.RingName)
546+
if err := i.autoJoin(context.Background(), i.getPreviousState()); err != nil {
547+
return errors.Wrapf(err, "failed to pick tokens in the KV store, ring: %s, state: %s", i.RingName, i.getPreviousState())
506548
}
507549
}
508550
}
@@ -519,9 +561,9 @@ func (i *Lifecycler) loop(ctx context.Context) error {
519561
if i.verifyTokens(context.Background()) {
520562
level.Info(i.logger).Log("msg", "token verification successful", "ring", i.RingName)
521563

522-
err := i.changeState(context.Background(), ACTIVE)
564+
err := i.changeState(context.Background(), i.getPreviousState())
523565
if err != nil {
524-
level.Error(i.logger).Log("msg", "failed to set state to ACTIVE", "ring", i.RingName, "err", err)
566+
level.Error(i.logger).Log("msg", "failed to set state", "ring", i.RingName, "state", i.getPreviousState(), "err", err)
525567
}
526568
} else {
527569
level.Info(i.logger).Log("msg", "token verification failed, observing", "ring", i.RingName)
@@ -564,6 +606,12 @@ func (i *Lifecycler) stopping(runningError error) error {
564606
heartbeatTickerStop, heartbeatTickerChan := newDisableableTicker(i.cfg.HeartbeatPeriod)
565607
defer heartbeatTickerStop()
566608

609+
// save current state into file
610+
if i.cfg.TokensFilePath != "" {
611+
currentState := i.GetState()
612+
i.setPreviousState(currentState)
613+
}
614+
567615
// Mark ourselves as Leaving so no more samples are sent to us.
568616
err := i.changeState(context.Background(), LEAVING)
569617
if err != nil {
@@ -613,9 +661,13 @@ func (i *Lifecycler) initRing(ctx context.Context) error {
613661
)
614662

615663
if i.cfg.TokensFilePath != "" {
616-
tokensFromFile, err = LoadTokensFromFile(i.cfg.TokensFilePath)
664+
tokenFile, err := i.loadTokenFile()
617665
if err != nil && !os.IsNotExist(err) {
618-
level.Error(i.logger).Log("msg", "error loading tokens from file", "err", err)
666+
level.Error(i.logger).Log("msg", "error loading tokens and previous state from file", "err", err)
667+
}
668+
669+
if tokenFile != nil {
670+
tokensFromFile = tokenFile.Tokens
619671
}
620672
} else {
621673
level.Info(i.logger).Log("msg", "not loading tokens from file, tokens file path is empty")
@@ -639,7 +691,7 @@ func (i *Lifecycler) initRing(ctx context.Context) error {
639691
if len(tokensFromFile) > 0 {
640692
level.Info(i.logger).Log("msg", "adding tokens from file", "num_tokens", len(tokensFromFile))
641693
if len(tokensFromFile) >= i.cfg.NumTokens && i.autoJoinOnStartup {
642-
i.setState(ACTIVE)
694+
i.setState(i.getPreviousState())
643695
}
644696
ringDesc.AddIngester(i.ID, i.Addr, i.Zone, tokensFromFile, i.GetState(), registeredAt)
645697
i.setTokens(tokensFromFile)
@@ -669,11 +721,11 @@ func (i *Lifecycler) initRing(ctx context.Context) error {
669721

670722
// If the ingester failed to clean its ring entry up it can leave its state in LEAVING
671723
// OR unregister_on_shutdown=false
672-
// if autoJoinOnStartup, move it into ACTIVE to ensure the ingester joins the ring.
673-
// else set to PENDING
724+
// if autoJoinOnStartup, move it into previous state based on token file (default: ACTIVE)
725+
// to ensure the ingester joins the ring. else set to PENDING
674726
if instanceDesc.State == LEAVING && len(instanceDesc.Tokens) != 0 {
675727
if i.autoJoinOnStartup {
676-
instanceDesc.State = ACTIVE
728+
instanceDesc.State = i.getPreviousState()
677729
} else {
678730
instanceDesc.State = PENDING
679731
}
@@ -908,10 +960,12 @@ func (i *Lifecycler) updateConsul(ctx context.Context) error {
908960
func (i *Lifecycler) changeState(ctx context.Context, state InstanceState) error {
909961
currState := i.GetState()
910962
// Only the following state transitions can be triggered externally
911-
if !((currState == PENDING && state == JOINING) || // triggered by TransferChunks at the beginning
912-
(currState == JOINING && state == PENDING) || // triggered by TransferChunks on failure
913-
(currState == JOINING && state == ACTIVE) || // triggered by TransferChunks on success
963+
if !((currState == PENDING && state == JOINING) ||
964+
(currState == JOINING && state == PENDING) ||
965+
(currState == JOINING && state == ACTIVE) ||
966+
(currState == JOINING && state == READONLY) ||
914967
(currState == PENDING && state == ACTIVE) || // triggered by autoJoin
968+
(currState == PENDING && state == READONLY) || // triggered by autoJoin
915969
(currState == ACTIVE && state == LEAVING) || // triggered by shutdown
916970
(currState == ACTIVE && state == READONLY) || // triggered by ingester mode
917971
(currState == READONLY && state == ACTIVE) || // triggered by ingester mode

pkg/ring/lifecycler_test.go

+14-2
Original file line numberDiff line numberDiff line change
@@ -716,7 +716,7 @@ func TestRestartIngester_DisabledHeartbeat_unregister_on_shutdown_false(t *testi
716716
require.NoError(t, services.StopAndAwaitTerminated(context.Background(), l2))
717717
}
718718

719-
func TestTokensOnDisk(t *testing.T) {
719+
func TestTokenFileOnDisk(t *testing.T) {
720720
ringStore, closer := consul.NewInMemoryClient(GetCodec(), log.NewNopLogger(), nil)
721721
t.Cleanup(func() { assert.NoError(t, closer.Close()) })
722722

@@ -756,6 +756,18 @@ func TestTokensOnDisk(t *testing.T) {
756756
len(desc.Ingesters["ing1"].Tokens) == 512
757757
})
758758

759+
// Change state from ACTIVE to READONLY
760+
err = l1.ChangeState(context.Background(), READONLY)
761+
require.NoError(t, err)
762+
test.Poll(t, 1000*time.Millisecond, true, func() interface{} {
763+
d, err := r.KVClient.Get(context.Background(), ringKey)
764+
require.NoError(t, err)
765+
766+
desc, ok := d.(*Desc)
767+
return ok &&
768+
desc.Ingesters["ing1"].State == READONLY
769+
})
770+
759771
require.NoError(t, services.StopAndAwaitTerminated(context.Background(), l1))
760772

761773
// Start new ingester at same token directory.
@@ -776,7 +788,7 @@ func TestTokensOnDisk(t *testing.T) {
776788
}
777789
return ok &&
778790
len(desc.Ingesters) == 1 &&
779-
desc.Ingesters["ing2"].State == ACTIVE &&
791+
desc.Ingesters["ing2"].State == READONLY &&
780792
len(desc.Ingesters["ing2"].Tokens) == 512
781793
})
782794

pkg/ring/token_file.go

+79
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
package ring
2+
3+
import (
4+
"encoding/json"
5+
"errors"
6+
"os"
7+
"sort"
8+
)
9+
10+
// TokenFile is the JSON document persisted at the tokens file path. It holds
// the instance's tokens together with the state recorded for the instance so
// that both can be restored on the next start.
type TokenFile struct {
11+
// PreviousState is serialized under the "previousState" key and is left out
// of the output when it is the zero value (omitempty).
PreviousState InstanceState `json:"previousState,omitempty"`
12+
// Tokens is serialized under the "tokens" key.
Tokens Tokens `json:"tokens"`
13+
}
14+
15+
// StoreToFile stores the tokens in the given directory.
16+
func (l TokenFile) StoreToFile(tokenFilePath string) error {
17+
if tokenFilePath == "" {
18+
return errors.New("path is empty")
19+
}
20+
21+
// If any operations failed further in the function, we keep the temporary
22+
// file hanging around for debugging.
23+
f, err := os.Create(tokenFilePath + ".tmp")
24+
if err != nil {
25+
return err
26+
}
27+
28+
defer func() {
29+
// If the file was not closed, then there must already be an error, hence ignore
30+
// the error (if any) from f.Close(). If the file was already closed, then
31+
// we would ignore the error in that case too.
32+
_ = f.Close()
33+
}()
34+
35+
b, err := json.Marshal(l)
36+
if err != nil {
37+
return err
38+
}
39+
if _, err = f.Write(b); err != nil {
40+
return err
41+
}
42+
43+
if err := f.Close(); err != nil {
44+
return err
45+
}
46+
47+
// Tokens successfully written, replace the temporary file with the actual file path.
48+
return os.Rename(f.Name(), tokenFilePath)
49+
}
50+
51+
func LoadTokenFile(tokenFilePath string) (*TokenFile, error) {
52+
b, err := os.ReadFile(tokenFilePath)
53+
if err != nil {
54+
return nil, err
55+
}
56+
t := TokenFile{}
57+
err = json.Unmarshal(b, &t)
58+
59+
// Tokens may have been written to file by an older version which
60+
// doesn't guarantee sorted tokens, so we enforce sorting here.
61+
if !sort.IsSorted(t.Tokens) {
62+
sort.Sort(t.Tokens)
63+
}
64+
65+
return &t, err
66+
}
67+
68+
func (p InstanceState) MarshalJSON() ([]byte, error) {
69+
ss := InstanceState_name[int32(p)]
70+
return json.Marshal(ss)
71+
}
72+
func (p *InstanceState) UnmarshalJSON(data []byte) error {
73+
res := ""
74+
if err := json.Unmarshal(data, &res); err != nil {
75+
return err
76+
}
77+
*p = InstanceState(InstanceState_value[res])
78+
return nil
79+
}

0 commit comments

Comments
 (0)