Skip to content

Commit 7292d8d

Browse files
authored
retrieve peakSamples and processedSamples query stats from results_cache as well (#6591)
Signed-off-by: Erlan Zholdubai uulu <[email protected]>
1 parent a0e7e35 commit 7292d8d

File tree

6 files changed

+152
-18
lines changed

6 files changed

+152
-18
lines changed

Diff for: CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
* [BUGFIX] Compactor: Cleaner should not put deletion marker for blocks with no-compact marker. #6576
1313
* [BUGFIX] Compactor: Cleaner would delete bucket index when there is no block in bucket store. #6577
1414
* [BUGFIX] Querier: Fix marshal native histogram with empty bucket when protobuf codec is enabled. #6595
15+
* [BUGFIX] Query Frontend: Fix samples scanned and peak samples query stats when query hits results cache. #6591
1516

1617
## 1.19.0 in progress
1718

Diff for: integration/query_frontend_test.go

+93
Original file line numberDiff line numberDiff line change
@@ -859,3 +859,96 @@ func TestQueryFrontendQueryRejection(t *testing.T) {
859859
require.Contains(t, string(body), tripperware.QueryRejectErrorMessage)
860860

861861
}
862+
863+
func TestQueryFrontendStatsFromResultsCacheShouldBeSame(t *testing.T) {
864+
865+
s, err := e2e.NewScenario(networkName)
866+
require.NoError(t, err)
867+
defer s.Close()
868+
869+
memcached := e2ecache.NewMemcached()
870+
consul := e2edb.NewConsul()
871+
require.NoError(t, s.StartAndWaitReady(consul, memcached))
872+
873+
flags := mergeFlags(BlocksStorageFlags(), map[string]string{
874+
"-querier.cache-results": "true",
875+
"-querier.split-queries-by-interval": "24h",
876+
"-querier.query-ingesters-within": "12h", // Required by the test on query /series out of ingesters time range
877+
"-querier.per-step-stats-enabled": strconv.FormatBool(true),
878+
"-frontend.memcached.addresses": "dns+" + memcached.NetworkEndpoint(e2ecache.MemcachedPort),
879+
"-frontend.query-stats-enabled": strconv.FormatBool(true),
880+
"-frontend.cache-queryable-samples-stats": strconv.FormatBool(true),
881+
})
882+
883+
minio := e2edb.NewMinio(9000, flags["-blocks-storage.s3.bucket-name"])
884+
require.NoError(t, s.StartAndWaitReady(minio))
885+
886+
// Start the query-scheduler
887+
queryScheduler := e2ecortex.NewQueryScheduler("query-scheduler", flags, "")
888+
require.NoError(t, s.StartAndWaitReady(queryScheduler))
889+
flags["-frontend.scheduler-address"] = queryScheduler.NetworkGRPCEndpoint()
890+
flags["-querier.scheduler-address"] = queryScheduler.NetworkGRPCEndpoint()
891+
892+
// Start the query-frontend.
893+
queryFrontend := e2ecortex.NewQueryFrontendWithConfigFile("query-frontend", "", flags, "")
894+
require.NoError(t, s.Start(queryFrontend))
895+
896+
// Start all other services.
897+
ingester := e2ecortex.NewIngesterWithConfigFile("ingester", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), "", flags, "")
898+
distributor := e2ecortex.NewDistributorWithConfigFile("distributor", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), "", flags, "")
899+
900+
querier := e2ecortex.NewQuerierWithConfigFile("querier", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), "", flags, "")
901+
902+
require.NoError(t, s.StartAndWaitReady(querier, ingester, distributor))
903+
require.NoError(t, s.WaitReady(queryFrontend))
904+
905+
// Check if we're discovering memcache or not.
906+
require.NoError(t, queryFrontend.WaitSumMetrics(e2e.Equals(1), "cortex_memcache_client_servers"))
907+
require.NoError(t, queryFrontend.WaitSumMetrics(e2e.Greater(0), "cortex_dns_lookups_total"))
908+
909+
// Wait until both the distributor and querier have updated the ring.
910+
require.NoError(t, distributor.WaitSumMetrics(e2e.Equals(512), "cortex_ring_tokens_total"))
911+
require.NoError(t, querier.WaitSumMetrics(e2e.Equals(512), "cortex_ring_tokens_total"))
912+
913+
// Push some series to Cortex.
914+
c, err := e2ecortex.NewClient(distributor.HTTPEndpoint(), "", "", "", "user-1")
915+
require.NoError(t, err)
916+
917+
seriesTimestamp := time.Now().Add(-10 * time.Minute)
918+
series2Timestamp := seriesTimestamp.Add(1 * time.Minute)
919+
series1, _ := generateSeries("series_1", seriesTimestamp, prompb.Label{Name: "job", Value: "test"})
920+
series2, _ := generateSeries("series_2", series2Timestamp, prompb.Label{Name: "job", Value: "test"})
921+
922+
res, err := c.Push(series1)
923+
require.NoError(t, err)
924+
require.Equal(t, 200, res.StatusCode)
925+
926+
res, err = c.Push(series2)
927+
require.NoError(t, err)
928+
require.Equal(t, 200, res.StatusCode)
929+
930+
// Query back the series.
931+
c, err = e2ecortex.NewClient("", queryFrontend.HTTPEndpoint(), "", "", "user-1")
932+
require.NoError(t, err)
933+
934+
// First request that will hit the datasource.
935+
resp, _, err := c.QueryRangeRaw(`{job="test"}`, seriesTimestamp.Add(-1*time.Minute), series2Timestamp.Add(1*time.Minute), 30*time.Second, map[string]string{})
936+
require.NoError(t, err)
937+
require.Equal(t, http.StatusOK, resp.StatusCode)
938+
939+
values, err := queryFrontend.SumMetrics([]string{"cortex_query_samples_scanned_total"})
940+
require.NoError(t, err)
941+
numSamplesScannedTotal := e2e.SumValues(values)
942+
943+
// We send the same query to hit the results cache.
944+
resp, _, err = c.QueryRangeRaw(`{job="test"}`, seriesTimestamp.Add(-1*time.Minute), series2Timestamp.Add(1*time.Minute), 30*time.Second, map[string]string{})
945+
require.NoError(t, err)
946+
require.Equal(t, http.StatusOK, resp.StatusCode)
947+
948+
values, err = queryFrontend.SumMetrics([]string{"cortex_query_samples_scanned_total"})
949+
require.NoError(t, err)
950+
numSamplesScannedTotal2 := e2e.SumValues(values)
951+
952+
// we expect same amount of samples_scanned added to the metric despite the second query hit the cache.
953+
require.Equal(t, numSamplesScannedTotal2, numSamplesScannedTotal*2)
954+
}

Diff for: pkg/frontend/transport/handler.go

+1
Original file line numberDiff line numberDiff line change
@@ -410,6 +410,7 @@ func (f *Handler) reportQueryStats(r *http.Request, source, userID string, query
410410
"split_queries", splitQueries,
411411
"status_code", statusCode,
412412
"response_size", contentLength,
413+
"samples_scanned", numScannedSamples,
413414
}, stats.LoadExtraFields()...)
414415

415416
if numStoreGatewayTouchedPostings > 0 {

Diff for: pkg/frontend/transport/handler_test.go

+8-8
Original file line numberDiff line numberDiff line change
@@ -434,11 +434,11 @@ func TestReportQueryStatsFormat(t *testing.T) {
434434

435435
tests := map[string]testCase{
436436
"should not include query and header details if empty": {
437-
expectedLog: `level=info msg="query stats" component=query-frontend method=GET path=/prometheus/api/v1/query response_time=1s query_wall_time_seconds=0 response_series_count=0 fetched_series_count=0 fetched_chunks_count=0 fetched_samples_count=0 fetched_chunks_bytes=0 fetched_data_bytes=0 split_queries=0 status_code=200 response_size=1000`,
437+
expectedLog: `level=info msg="query stats" component=query-frontend method=GET path=/prometheus/api/v1/query response_time=1s query_wall_time_seconds=0 response_series_count=0 fetched_series_count=0 fetched_chunks_count=0 fetched_samples_count=0 fetched_chunks_bytes=0 fetched_data_bytes=0 split_queries=0 status_code=200 response_size=1000 samples_scanned=0`,
438438
},
439439
"should include query length and string at the end": {
440440
queryString: url.Values(map[string][]string{"query": {"up"}}),
441-
expectedLog: `level=info msg="query stats" component=query-frontend method=GET path=/prometheus/api/v1/query response_time=1s query_wall_time_seconds=0 response_series_count=0 fetched_series_count=0 fetched_chunks_count=0 fetched_samples_count=0 fetched_chunks_bytes=0 fetched_data_bytes=0 split_queries=0 status_code=200 response_size=1000 query_length=2 param_query=up`,
441+
expectedLog: `level=info msg="query stats" component=query-frontend method=GET path=/prometheus/api/v1/query response_time=1s query_wall_time_seconds=0 response_series_count=0 fetched_series_count=0 fetched_chunks_count=0 fetched_samples_count=0 fetched_chunks_bytes=0 fetched_data_bytes=0 split_queries=0 status_code=200 response_size=1000 samples_scanned=0 query_length=2 param_query=up`,
442442
},
443443
"should include query stats": {
444444
queryStats: &querier_stats.QueryStats{
@@ -454,31 +454,31 @@ func TestReportQueryStatsFormat(t *testing.T) {
454454
SplitQueries: 10,
455455
},
456456
},
457-
expectedLog: `level=info msg="query stats" component=query-frontend method=GET path=/prometheus/api/v1/query response_time=1s query_wall_time_seconds=3 response_series_count=100 fetched_series_count=100 fetched_chunks_count=200 fetched_samples_count=300 fetched_chunks_bytes=1024 fetched_data_bytes=2048 split_queries=10 status_code=200 response_size=1000 query_storage_wall_time_seconds=6000`,
457+
expectedLog: `level=info msg="query stats" component=query-frontend method=GET path=/prometheus/api/v1/query response_time=1s query_wall_time_seconds=3 response_series_count=100 fetched_series_count=100 fetched_chunks_count=200 fetched_samples_count=300 fetched_chunks_bytes=1024 fetched_data_bytes=2048 split_queries=10 status_code=200 response_size=1000 samples_scanned=0 query_storage_wall_time_seconds=6000`,
458458
},
459459
"should include user agent": {
460460
header: http.Header{"User-Agent": []string{"Grafana"}},
461-
expectedLog: `level=info msg="query stats" component=query-frontend method=GET path=/prometheus/api/v1/query response_time=1s query_wall_time_seconds=0 response_series_count=0 fetched_series_count=0 fetched_chunks_count=0 fetched_samples_count=0 fetched_chunks_bytes=0 fetched_data_bytes=0 split_queries=0 status_code=200 response_size=1000 user_agent=Grafana`,
461+
expectedLog: `level=info msg="query stats" component=query-frontend method=GET path=/prometheus/api/v1/query response_time=1s query_wall_time_seconds=0 response_series_count=0 fetched_series_count=0 fetched_chunks_count=0 fetched_samples_count=0 fetched_chunks_bytes=0 fetched_data_bytes=0 split_queries=0 status_code=200 response_size=1000 samples_scanned=0 user_agent=Grafana`,
462462
},
463463
"should include response error": {
464464
responseErr: errors.New("foo_err"),
465-
expectedLog: `level=error msg="query stats" component=query-frontend method=GET path=/prometheus/api/v1/query response_time=1s query_wall_time_seconds=0 response_series_count=0 fetched_series_count=0 fetched_chunks_count=0 fetched_samples_count=0 fetched_chunks_bytes=0 fetched_data_bytes=0 split_queries=0 status_code=200 response_size=1000 error=foo_err`,
465+
expectedLog: `level=error msg="query stats" component=query-frontend method=GET path=/prometheus/api/v1/query response_time=1s query_wall_time_seconds=0 response_series_count=0 fetched_series_count=0 fetched_chunks_count=0 fetched_samples_count=0 fetched_chunks_bytes=0 fetched_data_bytes=0 split_queries=0 status_code=200 response_size=1000 samples_scanned=0 error=foo_err`,
466466
},
467467
"should include query priority": {
468468
queryString: url.Values(map[string][]string{"query": {"up"}}),
469469
queryStats: &querier_stats.QueryStats{
470470
Priority: 99,
471471
PriorityAssigned: true,
472472
},
473-
expectedLog: `level=info msg="query stats" component=query-frontend method=GET path=/prometheus/api/v1/query response_time=1s query_wall_time_seconds=0 response_series_count=0 fetched_series_count=0 fetched_chunks_count=0 fetched_samples_count=0 fetched_chunks_bytes=0 fetched_data_bytes=0 split_queries=0 status_code=200 response_size=1000 query_length=2 priority=99 param_query=up`,
473+
expectedLog: `level=info msg="query stats" component=query-frontend method=GET path=/prometheus/api/v1/query response_time=1s query_wall_time_seconds=0 response_series_count=0 fetched_series_count=0 fetched_chunks_count=0 fetched_samples_count=0 fetched_chunks_bytes=0 fetched_data_bytes=0 split_queries=0 status_code=200 response_size=1000 samples_scanned=0 query_length=2 priority=99 param_query=up`,
474474
},
475475
"should include data fetch min and max time": {
476476
queryString: url.Values(map[string][]string{"query": {"up"}}),
477477
queryStats: &querier_stats.QueryStats{
478478
DataSelectMaxTime: 1704153600000,
479479
DataSelectMinTime: 1704067200000,
480480
},
481-
expectedLog: `level=info msg="query stats" component=query-frontend method=GET path=/prometheus/api/v1/query response_time=1s query_wall_time_seconds=0 response_series_count=0 fetched_series_count=0 fetched_chunks_count=0 fetched_samples_count=0 fetched_chunks_bytes=0 fetched_data_bytes=0 split_queries=0 status_code=200 response_size=1000 data_select_max_time=1704153600 data_select_min_time=1704067200 query_length=2 param_query=up`,
481+
expectedLog: `level=info msg="query stats" component=query-frontend method=GET path=/prometheus/api/v1/query response_time=1s query_wall_time_seconds=0 response_series_count=0 fetched_series_count=0 fetched_chunks_count=0 fetched_samples_count=0 fetched_chunks_bytes=0 fetched_data_bytes=0 split_queries=0 status_code=200 response_size=1000 samples_scanned=0 data_select_max_time=1704153600 data_select_min_time=1704067200 query_length=2 param_query=up`,
482482
},
483483
"should include query stats with store gateway stats": {
484484
queryStats: &querier_stats.QueryStats{
@@ -496,7 +496,7 @@ func TestReportQueryStatsFormat(t *testing.T) {
496496
StoreGatewayTouchedPostingBytes: 200,
497497
},
498498
},
499-
expectedLog: `level=info msg="query stats" component=query-frontend method=GET path=/prometheus/api/v1/query response_time=1s query_wall_time_seconds=3 response_series_count=100 fetched_series_count=100 fetched_chunks_count=200 fetched_samples_count=300 fetched_chunks_bytes=1024 fetched_data_bytes=2048 split_queries=10 status_code=200 response_size=1000 store_gateway_touched_postings_count=20 store_gateway_touched_posting_bytes=200 query_storage_wall_time_seconds=6000`,
499+
expectedLog: `level=info msg="query stats" component=query-frontend method=GET path=/prometheus/api/v1/query response_time=1s query_wall_time_seconds=3 response_series_count=100 fetched_series_count=100 fetched_chunks_count=200 fetched_samples_count=300 fetched_chunks_bytes=1024 fetched_data_bytes=2048 split_queries=10 status_code=200 response_size=1000 samples_scanned=0 store_gateway_touched_postings_count=20 store_gateway_touched_posting_bytes=200 query_storage_wall_time_seconds=6000`,
500500
},
501501
}
502502

Diff for: pkg/querier/tripperware/queryrange/results_cache.go

+11-3
Original file line numberDiff line numberDiff line change
@@ -450,7 +450,7 @@ func (s resultsCache) handleHit(ctx context.Context, r tripperware.Request, exte
450450

451451
level.Debug(util_log.WithContext(ctx, log)).Log("msg", "handle hit", "start", r.GetStart(), "spanID", jaegerSpanID(ctx))
452452

453-
requests, responses, err := s.partition(r, extents)
453+
requests, responses, err := s.partition(ctx, r, extents)
454454
if err != nil {
455455
return nil, nil, err
456456
}
@@ -647,7 +647,7 @@ func convertFromTripperwarePrometheusResponse(resp tripperware.Response) tripper
647647

648648
// partition calculates the required requests to satisfy req given the cached data.
649649
// extents must be in order by start time.
650-
func (s resultsCache) partition(req tripperware.Request, extents []tripperware.Extent) ([]tripperware.Request, []tripperware.Response, error) {
650+
func (s resultsCache) partition(ctx context.Context, req tripperware.Request, extents []tripperware.Extent) ([]tripperware.Request, []tripperware.Response, error) {
651651
var requests []tripperware.Request
652652
var cachedResponses []tripperware.Response
653653
start := req.GetStart()
@@ -678,7 +678,14 @@ func (s resultsCache) partition(req tripperware.Request, extents []tripperware.E
678678
return nil, nil, err
679679
}
680680
// extract the overlap from the cached extent.
681-
cachedResponses = append(cachedResponses, s.extractor.Extract(start, req.GetEnd(), res))
681+
promRes := s.extractor.Extract(start, req.GetEnd(), res).(*tripperware.PrometheusResponse)
682+
cachedResponses = append(cachedResponses, promRes)
683+
684+
if queryStats := querier_stats.FromContext(ctx); queryStats != nil && promRes.Data.Stats != nil {
685+
queryStats.AddScannedSamples(uint64(promRes.Data.Stats.Samples.TotalQueryableSamples))
686+
queryStats.SetPeakSamples(max(queryStats.LoadPeakSamples(), uint64(promRes.Data.Stats.Samples.PeakSamples)))
687+
}
688+
682689
start = extent.End
683690
}
684691

@@ -807,6 +814,7 @@ func extractStats(start, end int64, stats *tripperware.PrometheusResponseStats)
807814
if start <= s.TimestampMs && s.TimestampMs <= end {
808815
result.Samples.TotalQueryableSamplesPerStep = append(result.Samples.TotalQueryableSamplesPerStep, s)
809816
result.Samples.TotalQueryableSamples += s.Value
817+
result.Samples.PeakSamples = max(result.Samples.PeakSamples, s.Value)
810818
}
811819
}
812820
return result

0 commit comments

Comments
 (0)