Skip to content

Commit e6e9fea

Browse files
authored
Create an experimental HealthCheck gRPC handler (#6225)
* Implementing HealthCheck grpc handlers Signed-off-by: alanprot <[email protected]>
* tests Signed-off-by: alanprot <[email protected]>
* lint Signed-off-by: alanprot <[email protected]>
* Adding tests Signed-off-by: alanprot <[email protected]>
* Name to the hc service + changelog Signed-off-by: alanprot <[email protected]>
---------
Signed-off-by: alanprot <[email protected]>
1 parent f971a60 commit e6e9fea

File tree

10 files changed

+455
-19
lines changed

10 files changed

+455
-19
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
* [ENHANCEMENT] Distributor: Add new `cortex_reduced_resolution_histogram_samples_total` metric to track the number of histogram samples whose resolution was reduced. #6182
1818
* [ENHANCEMENT] StoreGateway: Implement metadata API limit in queryable. #6195
1919
* [ENHANCEMENT] Ingester: Add matchers to ingester LabelNames() and LabelNamesStream() RPC. #6209
20+
* [ENHANCEMENT] Ingester/Store Gateway Clients: Introduce an experimental HealthCheck handler to quickly fail requests directed to unhealthy targets. #6225
2021
* [BUGFIX] Runtime-config: Handle absolute file paths when working directory is not / #6224
2122

2223
## 1.18.0 2024-09-03

docs/blocks-storage/querier.md

+18
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,24 @@ querier:
204204
# CLI flag: -querier.store-gateway-client.grpc-compression
205205
[grpc_compression: <string> | default = ""]
206206

207+
# EXPERIMENTAL: If enabled, gRPC clients perform health checks for each
208+
# target and fail the request if the target is marked as unhealthy.
209+
healthcheck_config:
210+
# The number of consecutive failed health checks required before
211+
# considering a target unhealthy. 0 means disabled.
212+
# CLI flag: -querier.store-gateway-client.unhealthy-threshold
213+
[unhealthy_threshold: <int> | default = 0]
214+
215+
# The approximate amount of time between health checks of an individual
216+
# target.
217+
# CLI flag: -querier.store-gateway-client.interval
218+
[interval: <duration> | default = 5s]
219+
220+
# The amount of time during which no response from a target means a failed
221+
# health check.
222+
# CLI flag: -querier.store-gateway-client.timeout
223+
[timeout: <duration> | default = 1s]
224+
207225
# If enabled, store gateway query stats will be logged using `info` log level.
208226
# CLI flag: -querier.store-gateway-query-stats-enabled
209227
[store_gateway_query_stats: <boolean> | default = true]

docs/configuration/config-file-reference.md

+36
Original file line numberDiff line numberDiff line change
@@ -3099,6 +3099,24 @@ grpc_client_config:
30993099
# CLI flag: -ingester.client.tls-insecure-skip-verify
31003100
[tls_insecure_skip_verify: <boolean> | default = false]
31013101
3102+
# EXPERIMENTAL: If enabled, gRPC clients perform health checks for each target
3103+
# and fail the request if the target is marked as unhealthy.
3104+
healthcheck_config:
3105+
# The number of consecutive failed health checks required before considering
3106+
# a target unhealthy. 0 means disabled.
3107+
# CLI flag: -ingester.client.unhealthy-threshold
3108+
[unhealthy_threshold: <int> | default = 0]
3109+
3110+
# The approximate amount of time between health checks of an individual
3111+
# target.
3112+
# CLI flag: -ingester.client.interval
3113+
[interval: <duration> | default = 5s]
3114+
3115+
# The amount of time during which no response from a target means a failed
3116+
# health check.
3117+
# CLI flag: -ingester.client.timeout
3118+
[timeout: <duration> | default = 1s]
3119+
31023120
# Max inflight push requests that this ingester client can handle. This limit is
31033121
# per-ingester-client. Additional requests will be rejected. 0 = unlimited.
31043122
# CLI flag: -ingester.client.max-inflight-push-requests
@@ -3815,6 +3833,24 @@ store_gateway_client:
38153833
# CLI flag: -querier.store-gateway-client.grpc-compression
38163834
[grpc_compression: <string> | default = ""]
38173835
3836+
# EXPERIMENTAL: If enabled, gRPC clients perform health checks for each target
3837+
# and fail the request if the target is marked as unhealthy.
3838+
healthcheck_config:
3839+
# The number of consecutive failed health checks required before considering
3840+
# a target unhealthy. 0 means disabled.
3841+
# CLI flag: -querier.store-gateway-client.unhealthy-threshold
3842+
[unhealthy_threshold: <int> | default = 0]
3843+
3844+
# The approximate amount of time between health checks of an individual
3845+
# target.
3846+
# CLI flag: -querier.store-gateway-client.interval
3847+
[interval: <duration> | default = 5s]
3848+
3849+
# The amount of time during which no response from a target means a failed
3850+
# health check.
3851+
# CLI flag: -querier.store-gateway-client.timeout
3852+
[timeout: <duration> | default = 1s]
3853+
38183854
# If enabled, store gateway query stats will be logged using `info` log level.
38193855
# CLI flag: -querier.store-gateway-query-stats-enabled
38203856
[store_gateway_query_stats: <boolean> | default = true]

pkg/cortex/modules.go

+18-2
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ import (
4848
"github.com/cortexproject/cortex/pkg/scheduler"
4949
"github.com/cortexproject/cortex/pkg/storage/bucket"
5050
"github.com/cortexproject/cortex/pkg/storegateway"
51+
"github.com/cortexproject/cortex/pkg/util/grpcclient"
5152
util_log "github.com/cortexproject/cortex/pkg/util/log"
5253
"github.com/cortexproject/cortex/pkg/util/modules"
5354
"github.com/cortexproject/cortex/pkg/util/runtimeconfig"
@@ -65,6 +66,7 @@ const (
6566
Server string = "server"
6667
Distributor string = "distributor"
6768
DistributorService string = "distributor-service"
69+
GrpcClientService string = "grpcclient-service"
6870
Ingester string = "ingester"
6971
IngesterService string = "ingester-service"
7072
Flusher string = "flusher"
@@ -230,6 +232,19 @@ func (t *Cortex) initDistributorService() (serv services.Service, err error) {
230232
return t.Distributor, nil
231233
}
232234

235+
// initGrpcClientServices creates a single health check interceptors instance
// (logging through util_log.Logger) and returns it as the module's service.
// The same instance is assigned to the ingester client and store-gateway
// client configurations, but only for those whose UnhealthyThreshold is
// greater than zero; otherwise their HealthCheckInterceptors field is left
// untouched.
func (t *Cortex) initGrpcClientServices() (serv services.Service, err error) {
236+
s := grpcclient.NewHealthCheckInterceptors(util_log.Logger)
237+
if t.Cfg.IngesterClient.GRPCClientConfig.HealthCheckConfig.UnhealthyThreshold > 0 {
238+
t.Cfg.IngesterClient.GRPCClientConfig.HealthCheckConfig.HealthCheckInterceptors = s
239+
}
240+
241+
if t.Cfg.Querier.StoreGatewayClient.HealthCheckConfig.UnhealthyThreshold > 0 {
242+
t.Cfg.Querier.StoreGatewayClient.HealthCheckConfig.HealthCheckInterceptors = s
243+
}
244+
245+
return s, nil
246+
}
247+
233248
func (t *Cortex) initDistributor() (serv services.Service, err error) {
234249
t.API.RegisterDistributor(t.Distributor, t.Cfg.Distributor)
235250

@@ -754,6 +769,7 @@ func (t *Cortex) setupModuleManager() error {
754769
mm.RegisterModule(OverridesExporter, t.initOverridesExporter)
755770
mm.RegisterModule(Distributor, t.initDistributor)
756771
mm.RegisterModule(DistributorService, t.initDistributorService, modules.UserInvisibleModule)
772+
mm.RegisterModule(GrpcClientService, t.initGrpcClientServices, modules.UserInvisibleModule)
757773
mm.RegisterModule(Ingester, t.initIngester)
758774
mm.RegisterModule(IngesterService, t.initIngesterService, modules.UserInvisibleModule)
759775
mm.RegisterModule(Flusher, t.initFlusher)
@@ -782,14 +798,14 @@ func (t *Cortex) setupModuleManager() error {
782798
Ring: {API, RuntimeConfig, MemberlistKV},
783799
Overrides: {RuntimeConfig},
784800
OverridesExporter: {RuntimeConfig},
785-
Distributor: {DistributorService, API},
801+
Distributor: {DistributorService, API, GrpcClientService},
786802
DistributorService: {Ring, Overrides},
787803
Ingester: {IngesterService, Overrides, API},
788804
IngesterService: {Overrides, RuntimeConfig, MemberlistKV},
789805
Flusher: {Overrides, API},
790806
Queryable: {Overrides, DistributorService, Overrides, Ring, API, StoreQueryable, MemberlistKV},
791807
Querier: {TenantFederation},
792-
StoreQueryable: {Overrides, Overrides, MemberlistKV},
808+
StoreQueryable: {Overrides, Overrides, MemberlistKV, GrpcClientService},
793809
QueryFrontendTripperware: {API, Overrides},
794810
QueryFrontend: {QueryFrontendTripperware},
795811
QueryScheduler: {API, Overrides},

pkg/ingester/client/client.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -111,8 +111,8 @@ func (c *closableHealthAndIngesterClient) Close() error {
111111

112112
// Config is the configuration struct for the ingester client
113113
type Config struct {
114-
GRPCClientConfig grpcclient.Config `yaml:"grpc_client_config"`
115-
MaxInflightPushRequests int64 `yaml:"max_inflight_push_requests"`
114+
GRPCClientConfig grpcclient.ConfigWithHealthCheck `yaml:"grpc_client_config"`
115+
MaxInflightPushRequests int64 `yaml:"max_inflight_push_requests"`
116116
}
117117

118118
// RegisterFlags registers configuration settings used by the ingester client config.

pkg/querier/store_gateway_client.go

+19-14
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ import (
1717
"github.com/cortexproject/cortex/pkg/util/tls"
1818
)
1919

20-
func newStoreGatewayClientFactory(clientCfg grpcclient.Config, reg prometheus.Registerer) client.PoolFactory {
20+
func newStoreGatewayClientFactory(clientCfg grpcclient.ConfigWithHealthCheck, reg prometheus.Registerer) client.PoolFactory {
2121
requestDuration := promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
2222
Namespace: "cortex",
2323
Name: "storegateway_client_request_duration_seconds",
@@ -31,7 +31,7 @@ func newStoreGatewayClientFactory(clientCfg grpcclient.Config, reg prometheus.Re
3131
}
3232
}
3333

34-
func dialStoreGatewayClient(clientCfg grpcclient.Config, addr string, requestDuration *prometheus.HistogramVec) (*storeGatewayClient, error) {
34+
func dialStoreGatewayClient(clientCfg grpcclient.ConfigWithHealthCheck, addr string, requestDuration *prometheus.HistogramVec) (*storeGatewayClient, error) {
3535
opts, err := clientCfg.DialOption(grpcclient.Instrument(requestDuration))
3636
if err != nil {
3737
return nil, err
@@ -69,15 +69,18 @@ func (c *storeGatewayClient) RemoteAddress() string {
6969

7070
func newStoreGatewayClientPool(discovery client.PoolServiceDiscovery, clientConfig ClientConfig, logger log.Logger, reg prometheus.Registerer) *client.Pool {
7171
// We prefer sane defaults instead of exposing further config options.
72-
clientCfg := grpcclient.Config{
73-
MaxRecvMsgSize: 100 << 20,
74-
MaxSendMsgSize: 16 << 20,
75-
GRPCCompression: clientConfig.GRPCCompression,
76-
RateLimit: 0,
77-
RateLimitBurst: 0,
78-
BackoffOnRatelimits: false,
79-
TLSEnabled: clientConfig.TLSEnabled,
80-
TLS: clientConfig.TLS,
72+
clientCfg := grpcclient.ConfigWithHealthCheck{
73+
Config: grpcclient.Config{
74+
MaxRecvMsgSize: 100 << 20,
75+
MaxSendMsgSize: 16 << 20,
76+
GRPCCompression: clientConfig.GRPCCompression,
77+
RateLimit: 0,
78+
RateLimitBurst: 0,
79+
BackoffOnRatelimits: false,
80+
TLSEnabled: clientConfig.TLSEnabled,
81+
TLS: clientConfig.TLS,
82+
},
83+
HealthCheckConfig: clientConfig.HealthCheckConfig,
8184
}
8285
poolCfg := client.PoolConfig{
8386
CheckInterval: time.Minute,
@@ -96,13 +99,15 @@ func newStoreGatewayClientPool(discovery client.PoolServiceDiscovery, clientConf
9699
}
97100

98101
type ClientConfig struct {
99-
TLSEnabled bool `yaml:"tls_enabled"`
100-
TLS tls.ClientConfig `yaml:",inline"`
101-
GRPCCompression string `yaml:"grpc_compression"`
102+
TLSEnabled bool `yaml:"tls_enabled"`
103+
TLS tls.ClientConfig `yaml:",inline"`
104+
GRPCCompression string `yaml:"grpc_compression"`
105+
HealthCheckConfig grpcclient.HealthCheckConfig `yaml:"healthcheck_config" doc:"description=EXPERIMENTAL: If enabled, gRPC clients perform health checks for each target and fail the request if the target is marked as unhealthy."`
102106
}
103107

104108
// RegisterFlagsWithPrefix registers the store-gateway client flags on f, each
// name starting with prefix: the TLS enable switch, the gRPC compression
// setting, the TLS client options and the health check options.
func (cfg *ClientConfig) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) {
105109
f.BoolVar(&cfg.TLSEnabled, prefix+".tls-enabled", cfg.TLSEnabled, "Enable TLS for gRPC client connecting to store-gateway.")
106110
f.StringVar(&cfg.GRPCCompression, prefix+".grpc-compression", "", "Use compression when sending messages. Supported values are: 'gzip', 'snappy' and '' (disable compression)")
107111
cfg.TLS.RegisterFlagsWithPrefix(prefix, f)
112+
cfg.HealthCheckConfig.RegisterFlagsWithPrefix(prefix, f)
108113
}

pkg/querier/store_gateway_client_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ func Test_newStoreGatewayClientFactory(t *testing.T) {
3636

3737
// Create a client factory and query back the mocked service
3838
// with different clients.
39-
cfg := grpcclient.Config{}
39+
cfg := grpcclient.ConfigWithHealthCheck{}
4040
flagext.DefaultValues(&cfg)
4141

4242
reg := prometheus.NewPedanticRegistry()

pkg/util/grpcclient/grpcclient.go

+19
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,21 @@ type Config struct {
3434
SignWriteRequestsEnabled bool `yaml:"-"`
3535
}
3636

37+
// ConfigWithHealthCheck is a gRPC client Config extended with the
// experimental health check settings. Config is embedded and inlined in YAML,
// so its fields sit at the same level as healthcheck_config.
type ConfigWithHealthCheck struct {
38+
Config `yaml:",inline"`
39+
HealthCheckConfig HealthCheckConfig `yaml:"healthcheck_config" doc:"description=EXPERIMENTAL: If enabled, gRPC clients perform health checks for each target and fail the request if the target is marked as unhealthy."`
40+
}
41+
3742
// RegisterFlags registers flags.
3843
func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
3944
cfg.RegisterFlagsWithPrefix("", "", f)
4045
}
4146

47+
// RegisterFlagsWithPrefix registers the flags of the embedded Config (passing
// defaultGrpcCompression through) and then the flags of HealthCheckConfig,
// both under the same prefix.
func (cfg *ConfigWithHealthCheck) RegisterFlagsWithPrefix(prefix, defaultGrpcCompression string, f *flag.FlagSet) {
48+
cfg.Config.RegisterFlagsWithPrefix(prefix, defaultGrpcCompression, f)
49+
cfg.HealthCheckConfig.RegisterFlagsWithPrefix(prefix, f)
50+
}
51+
4252
// RegisterFlagsWithPrefix registers flags with prefix.
4353
func (cfg *Config) RegisterFlagsWithPrefix(prefix, defaultGrpcCompression string, f *flag.FlagSet) {
4454
f.IntVar(&cfg.MaxRecvMsgSize, prefix+".grpc-max-recv-msg-size", 100<<20, "gRPC client max receive message size (bytes).")
@@ -75,6 +85,15 @@ func (cfg *Config) CallOptions() []grpc.CallOption {
7585
return opts
7686
}
7787

88+
// DialOption returns the config as a grpc.DialOptions. When
// HealthCheckConfig.HealthCheckInterceptors is set, the unary and stream
// health check interceptors are appended after the caller-supplied ones
// before delegating to the embedded Config.DialOption; otherwise the supplied
// interceptors are passed through unchanged.
func (cfg *ConfigWithHealthCheck) DialOption(unaryClientInterceptors []grpc.UnaryClientInterceptor, streamClientInterceptors []grpc.StreamClientInterceptor) ([]grpc.DialOption, error) {
89+
if cfg.HealthCheckConfig.HealthCheckInterceptors != nil {
90+
unaryClientInterceptors = append(unaryClientInterceptors, cfg.HealthCheckConfig.UnaryHealthCheckInterceptor(cfg))
91+
streamClientInterceptors = append(streamClientInterceptors, cfg.HealthCheckConfig.StreamClientInterceptor(cfg))
92+
}
93+
94+
return cfg.Config.DialOption(unaryClientInterceptors, streamClientInterceptors)
95+
}
96+
7897
// DialOption returns the config as a grpc.DialOptions.
7998
func (cfg *Config) DialOption(unaryClientInterceptors []grpc.UnaryClientInterceptor, streamClientInterceptors []grpc.StreamClientInterceptor) ([]grpc.DialOption, error) {
8099
var opts []grpc.DialOption

0 commit comments

Comments
 (0)