Skip to content

Commit 0b8c593

Browse files
authored
Add querier.ingester-query-max-attempts to retry on partial data. (#6714)
* Add static retry Signed-off-by: Justin Jung <[email protected]> * Add querier.ingester-query-max-attempts Signed-off-by: Justin Jung <[email protected]> * Add docs Signed-off-by: Justin Jung <[email protected]> * Add changelog Signed-off-by: Justin Jung <[email protected]> * Lint Signed-off-by: Justin Jung <[email protected]> * Use backoff Signed-off-by: Justin Jung <[email protected]> * Fix test Signed-off-by: Justin Jung <[email protected]> * Change default retry from 3 to 1 Signed-off-by: Justin Jung <[email protected]> * Reduce backoff time + update doc Signed-off-by: Justin Jung <[email protected]> --------- Signed-off-by: Justin Jung <[email protected]>
1 parent bd733d2 commit 0b8c593

File tree

7 files changed

+313
-40
lines changed

7 files changed

+313
-40
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
* [ENHANCEMENT] AlertManager: Add `keep_instance_in_the_ring_on_shutdown` and `tokens_file_path` configs for alertmanager ring. #6628
3131
* [ENHANCEMENT] Querier: Add metric and enhanced logging for query partial data. #6676
3232
* [ENHANCEMENT] Ingester: Push request should fail when label set is out of order #6746
33+
* [ENHANCEMENT] Querier: Add `querier.ingester-query-max-attempts` to retry on partial data. #6714
3334
* [BUGFIX] Ingester: Avoid error or early throttling when READONLY ingesters are present in the ring #6517
3435
* [BUGFIX] Ingester: Fix labelset data race condition. #6573
3536
* [BUGFIX] Compactor: Cleaner should not put deletion marker for blocks with no-compact marker. #6576

docs/blocks-storage/querier.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,11 @@ querier:
237237
# CLI flag: -querier.store-gateway-consistency-check-max-attempts
238238
[store_gateway_consistency_check_max_attempts: <int> | default = 3]
239239

240+
# The maximum number of times we attempt fetching data from ingesters for
241+
# retryable errors (e.g. partial data returned).
242+
# CLI flag: -querier.ingester-query-max-attempts
243+
[ingester_query_max_attempts: <int> | default = 1]
244+
240245
# When distributor's sharding strategy is shuffle-sharding and this setting is
241246
# > 0, queriers fetch in-memory series from the minimum set of required
242247
# ingesters, selecting only ingesters which may have received series since

docs/configuration/config-file-reference.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4174,6 +4174,11 @@ store_gateway_client:
41744174
# CLI flag: -querier.store-gateway-consistency-check-max-attempts
41754175
[store_gateway_consistency_check_max_attempts: <int> | default = 3]
41764176

4177+
# The maximum number of times we attempt fetching data from ingesters for
4178+
# retryable errors (e.g. partial data returned).
4179+
# CLI flag: -querier.ingester-query-max-attempts
4180+
[ingester_query_max_attempts: <int> | default = 1]
4181+
41774182
# When distributor's sharding strategy is shuffle-sharding and this setting is >
41784183
# 0, queriers fetch in-memory series from the minimum set of required ingesters,
41794184
# selecting only ingesters which may have received series since 'now - lookback

pkg/querier/distributor_queryable.go

Lines changed: 109 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,14 @@ import (
2020
"github.com/cortexproject/cortex/pkg/querier/series"
2121
"github.com/cortexproject/cortex/pkg/tenant"
2222
"github.com/cortexproject/cortex/pkg/util"
23+
"github.com/cortexproject/cortex/pkg/util/backoff"
2324
"github.com/cortexproject/cortex/pkg/util/chunkcompat"
2425
"github.com/cortexproject/cortex/pkg/util/spanlogger"
2526
)
2627

28+
// retryMinBackoff is passed as backoff.Config.MinBackoff by queryWithRetry and
// labelsWithRetry when they retry an ingester query.
const retryMinBackoff = time.Millisecond
29+
// retryMaxBackoff is passed as backoff.Config.MaxBackoff by queryWithRetry and
// labelsWithRetry when they retry an ingester query.
const retryMaxBackoff = 5 * time.Millisecond
30+
2731
// Distributor is the read interface to the distributor, made an interface here
2832
// to reduce package coupling.
2933
type Distributor interface {
@@ -38,36 +42,39 @@ type Distributor interface {
3842
MetricsMetadata(ctx context.Context, req *client.MetricsMetadataRequest) ([]scrape.MetricMetadata, error)
3943
}
4044

41-
func newDistributorQueryable(distributor Distributor, streamingMetdata bool, labelNamesWithMatchers bool, iteratorFn chunkIteratorFunc, queryIngestersWithin time.Duration, isPartialDataEnabled partialdata.IsCfgEnabledFunc) QueryableWithFilter {
45+
func newDistributorQueryable(distributor Distributor, streamingMetdata bool, labelNamesWithMatchers bool, iteratorFn chunkIteratorFunc, queryIngestersWithin time.Duration, isPartialDataEnabled partialdata.IsCfgEnabledFunc, ingesterQueryMaxAttempts int) QueryableWithFilter {
4246
return distributorQueryable{
43-
distributor: distributor,
44-
streamingMetdata: streamingMetdata,
45-
labelNamesWithMatchers: labelNamesWithMatchers,
46-
iteratorFn: iteratorFn,
47-
queryIngestersWithin: queryIngestersWithin,
48-
isPartialDataEnabled: isPartialDataEnabled,
47+
distributor: distributor,
48+
streamingMetdata: streamingMetdata,
49+
labelNamesWithMatchers: labelNamesWithMatchers,
50+
iteratorFn: iteratorFn,
51+
queryIngestersWithin: queryIngestersWithin,
52+
isPartialDataEnabled: isPartialDataEnabled,
53+
ingesterQueryMaxAttempts: ingesterQueryMaxAttempts,
4954
}
5055
}
5156

5257
type distributorQueryable struct {
53-
distributor Distributor
54-
streamingMetdata bool
55-
labelNamesWithMatchers bool
56-
iteratorFn chunkIteratorFunc
57-
queryIngestersWithin time.Duration
58-
isPartialDataEnabled partialdata.IsCfgEnabledFunc
58+
distributor Distributor
59+
streamingMetdata bool
60+
labelNamesWithMatchers bool
61+
iteratorFn chunkIteratorFunc
62+
queryIngestersWithin time.Duration
63+
isPartialDataEnabled partialdata.IsCfgEnabledFunc
64+
ingesterQueryMaxAttempts int
5965
}
6066

6167
func (d distributorQueryable) Querier(mint, maxt int64) (storage.Querier, error) {
6268
return &distributorQuerier{
63-
distributor: d.distributor,
64-
mint: mint,
65-
maxt: maxt,
66-
streamingMetadata: d.streamingMetdata,
67-
labelNamesMatchers: d.labelNamesWithMatchers,
68-
chunkIterFn: d.iteratorFn,
69-
queryIngestersWithin: d.queryIngestersWithin,
70-
isPartialDataEnabled: d.isPartialDataEnabled,
69+
distributor: d.distributor,
70+
mint: mint,
71+
maxt: maxt,
72+
streamingMetadata: d.streamingMetdata,
73+
labelNamesMatchers: d.labelNamesWithMatchers,
74+
chunkIterFn: d.iteratorFn,
75+
queryIngestersWithin: d.queryIngestersWithin,
76+
isPartialDataEnabled: d.isPartialDataEnabled,
77+
ingesterQueryMaxAttempts: d.ingesterQueryMaxAttempts,
7178
}, nil
7279
}
7380

@@ -77,13 +84,14 @@ func (d distributorQueryable) UseQueryable(now time.Time, _, queryMaxT int64) bo
7784
}
7885

7986
type distributorQuerier struct {
80-
distributor Distributor
81-
mint, maxt int64
82-
streamingMetadata bool
83-
labelNamesMatchers bool
84-
chunkIterFn chunkIteratorFunc
85-
queryIngestersWithin time.Duration
86-
isPartialDataEnabled partialdata.IsCfgEnabledFunc
87+
distributor Distributor
88+
mint, maxt int64
89+
streamingMetadata bool
90+
labelNamesMatchers bool
91+
chunkIterFn chunkIteratorFunc
92+
queryIngestersWithin time.Duration
93+
isPartialDataEnabled partialdata.IsCfgEnabledFunc
94+
ingesterQueryMaxAttempts int
8795
}
8896

8997
// Select implements storage.Querier interface.
@@ -150,7 +158,9 @@ func (q *distributorQuerier) Select(ctx context.Context, sortSeries bool, sp *st
150158
}
151159

152160
func (q *distributorQuerier) streamingSelect(ctx context.Context, sortSeries, partialDataEnabled bool, minT, maxT int64, matchers []*labels.Matcher) storage.SeriesSet {
153-
results, err := q.distributor.QueryStream(ctx, model.Time(minT), model.Time(maxT), partialDataEnabled, matchers...)
161+
results, err := q.queryWithRetry(ctx, func() (*client.QueryStreamResponse, error) {
162+
return q.distributor.QueryStream(ctx, model.Time(minT), model.Time(maxT), partialDataEnabled, matchers...)
163+
})
154164

155165
if err != nil && !partialdata.IsPartialDataError(err) {
156166
return storage.ErrSeriesSet(err)
@@ -192,6 +202,33 @@ func (q *distributorQuerier) streamingSelect(ctx context.Context, sortSeries, pa
192202
return seriesSet
193203
}
194204

205+
func (q *distributorQuerier) queryWithRetry(ctx context.Context, queryFunc func() (*client.QueryStreamResponse, error)) (*client.QueryStreamResponse, error) {
206+
if q.ingesterQueryMaxAttempts <= 1 {
207+
return queryFunc()
208+
}
209+
210+
var result *client.QueryStreamResponse
211+
var err error
212+
213+
retries := backoff.New(ctx, backoff.Config{
214+
MinBackoff: retryMinBackoff,
215+
MaxBackoff: retryMaxBackoff,
216+
MaxRetries: q.ingesterQueryMaxAttempts,
217+
})
218+
219+
for retries.Ongoing() {
220+
result, err = queryFunc()
221+
222+
if err == nil || !q.isRetryableError(err) {
223+
return result, err
224+
}
225+
226+
retries.Wait()
227+
}
228+
229+
return result, err
230+
}
231+
195232
func (q *distributorQuerier) LabelValues(ctx context.Context, name string, hints *storage.LabelHints, matchers ...*labels.Matcher) ([]string, annotations.Annotations, error) {
196233
var (
197234
lvs []string
@@ -201,9 +238,13 @@ func (q *distributorQuerier) LabelValues(ctx context.Context, name string, hints
201238
partialDataEnabled := q.partialDataEnabled(ctx)
202239

203240
if q.streamingMetadata {
204-
lvs, err = q.distributor.LabelValuesForLabelNameStream(ctx, model.Time(q.mint), model.Time(q.maxt), model.LabelName(name), hints, partialDataEnabled, matchers...)
241+
lvs, err = q.labelsWithRetry(ctx, func() ([]string, error) {
242+
return q.distributor.LabelValuesForLabelNameStream(ctx, model.Time(q.mint), model.Time(q.maxt), model.LabelName(name), hints, partialDataEnabled, matchers...)
243+
})
205244
} else {
206-
lvs, err = q.distributor.LabelValuesForLabelName(ctx, model.Time(q.mint), model.Time(q.maxt), model.LabelName(name), hints, partialDataEnabled, matchers...)
245+
lvs, err = q.labelsWithRetry(ctx, func() ([]string, error) {
246+
return q.distributor.LabelValuesForLabelName(ctx, model.Time(q.mint), model.Time(q.maxt), model.LabelName(name), hints, partialDataEnabled, matchers...)
247+
})
207248
}
208249

209250
if partialdata.IsPartialDataError(err) {
@@ -230,9 +271,13 @@ func (q *distributorQuerier) LabelNames(ctx context.Context, hints *storage.Labe
230271
)
231272

232273
if q.streamingMetadata {
233-
ln, err = q.distributor.LabelNamesStream(ctx, model.Time(q.mint), model.Time(q.maxt), hints, partialDataEnabled, matchers...)
274+
ln, err = q.labelsWithRetry(ctx, func() ([]string, error) {
275+
return q.distributor.LabelNamesStream(ctx, model.Time(q.mint), model.Time(q.maxt), hints, partialDataEnabled, matchers...)
276+
})
234277
} else {
235-
ln, err = q.distributor.LabelNames(ctx, model.Time(q.mint), model.Time(q.maxt), hints, partialDataEnabled, matchers...)
278+
ln, err = q.labelsWithRetry(ctx, func() ([]string, error) {
279+
return q.distributor.LabelNames(ctx, model.Time(q.mint), model.Time(q.maxt), hints, partialDataEnabled, matchers...)
280+
})
236281
}
237282

238283
if partialdata.IsPartialDataError(err) {
@@ -243,6 +288,33 @@ func (q *distributorQuerier) LabelNames(ctx context.Context, hints *storage.Labe
243288
return ln, nil, err
244289
}
245290

291+
func (q *distributorQuerier) labelsWithRetry(ctx context.Context, labelsFunc func() ([]string, error)) ([]string, error) {
292+
if q.ingesterQueryMaxAttempts == 1 {
293+
return labelsFunc()
294+
}
295+
296+
var result []string
297+
var err error
298+
299+
retries := backoff.New(ctx, backoff.Config{
300+
MinBackoff: retryMinBackoff,
301+
MaxBackoff: retryMaxBackoff,
302+
MaxRetries: q.ingesterQueryMaxAttempts,
303+
})
304+
305+
for retries.Ongoing() {
306+
result, err = labelsFunc()
307+
308+
if err == nil || !q.isRetryableError(err) {
309+
return result, err
310+
}
311+
312+
retries.Wait()
313+
}
314+
315+
return result, err
316+
}
317+
246318
// labelNamesWithMatchers performs the LabelNames call by calling ingester's MetricsForLabelMatchers method
247319
func (q *distributorQuerier) labelNamesWithMatchers(ctx context.Context, hints *storage.LabelHints, partialDataEnabled bool, matchers ...*labels.Matcher) ([]string, annotations.Annotations, error) {
248320
log, ctx := spanlogger.New(ctx, "distributorQuerier.labelNamesWithMatchers")
@@ -297,6 +369,10 @@ func (q *distributorQuerier) partialDataEnabled(ctx context.Context) bool {
297369
return q.isPartialDataEnabled != nil && q.isPartialDataEnabled(userID)
298370
}
299371

372+
func (q *distributorQuerier) isRetryableError(err error) bool {
373+
return partialdata.IsPartialDataError(err)
374+
}
375+
300376
type distributorExemplarQueryable struct {
301377
distributor Distributor
302378
}

0 commit comments

Comments
 (0)