Skip to content

Commit 69e6a28

Browse files
authored
Add ooo native histogram ingestion (#6626)
Signed-off-by: SungJin1212 <[email protected]>
1 parent ef1ac7c commit 69e6a28

File tree

8 files changed

+314
-0
lines changed

8 files changed

+314
-0
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
* [FEATURE] Query Frontend: Add dynamic interval size for query splitting. This is enabled by configuring experimental flags `querier.max-shards-per-query` and/or `querier.max-fetched-data-duration-per-query`. The split interval size is dynamically increased to maintain a number of shards and total duration fetched below the configured values. #6458
66
* [FEATURE] Querier/Ruler: Add `query_partial_data` and `rules_partial_data` limits to allow queries/rules to be evaluated with data from a single zone, if other zones are not available. #6526
77
* [FEATURE] Update prometheus alertmanager version to v0.28.0 and add new integration msteamsv2, jira, and rocketchat. #6590
8+
* [FEATURE] Ingester: Add a `-ingester.enable-ooo-native-histograms` flag to enable out-of-order native histogram ingestion per tenant. It only takes effect when `-blocks-storage.tsdb.enable-native-histograms=true` and `-ingester.out-of-order-time-window` > 0. If the value is changed at runtime through the runtime config, the change is applied only after a restart. #6626
89
* [ENHANCEMENT] Alertmanager: Add new limits `-alertmanager.max-silences-count` and `-alertmanager.max-silences-size-bytes` for limiting silences per tenant. #6605
910
* [ENHANCEMENT] Update prometheus version to v3.1.0. #6583
1011
* [ENHANCEMENT] Add `compactor.auto-forget-delay` for compactor to auto forget compactors after X minutes without heartbeat. #6533

docs/configuration/config-file-reference.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3526,6 +3526,13 @@ The `limits_config` configures default and per-tenant limits imposed by Cortex s
35263526
# CLI flag: -ingester.max-exemplars
35273527
[max_exemplars: <int> | default = 0]
35283528
3529+
# [Experimental] Enable out-of-order native histogram ingestion, it only takes
3530+
# effect when -blocks-storage.tsdb.enable-native-histograms=true and
3531+
# -ingester.out-of-order-time-window > 0. It is applied after the restart if it
3532+
# is changed at runtime through the runtime config.
3533+
# CLI flag: -ingester.enable-ooo-native-histograms
3534+
[enable_ooo_native_histograms: <boolean> | default = false]
3535+
35293536
# Maximum number of chunks that can be fetched in a single query from ingesters
35303537
# and long-term storage. This limit is enforced in the querier, ruler and
35313538
# store-gateway. 0 to disable.

docs/configuration/v1-guarantees.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ Currently experimental features are:
104104
- `-blocks-storage.tsdb.out-of-order-cap-max` (int) CLI flag
105105
- `-ingester.out-of-order-time-window` (duration) CLI flag
106106
- `out_of_order_time_window` (duration) field in runtime config file
107+
- `enable_ooo_native_histograms` (bool) field in runtime config file
107108
- Store Gateway Zone Stable Shuffle Sharding
108109
- `-store-gateway.sharding-ring.zone-stable-shuffle-sharding` CLI flag
109110
- `zone_stable_shuffle_sharding` (boolean) field in config file

integration/native_histogram_test.go

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"time"
1111

1212
"github.com/prometheus/common/model"
13+
"github.com/prometheus/prometheus/model/labels"
1314
"github.com/prometheus/prometheus/prompb"
1415
"github.com/prometheus/prometheus/tsdb/tsdbutil"
1516
"github.com/stretchr/testify/require"
@@ -19,6 +20,84 @@ import (
1920
"github.com/cortexproject/cortex/integration/e2ecortex"
2021
)
2122

23+
func TestOOONativeHistogramIngestion(t *testing.T) {
24+
s, err := e2e.NewScenario(networkName)
25+
require.NoError(t, err)
26+
defer s.Close()
27+
28+
// Start dependencies.
29+
consul := e2edb.NewConsulWithName("consul")
30+
require.NoError(t, s.StartAndWaitReady(consul))
31+
32+
baseFlags := mergeFlags(AlertmanagerLocalFlags(), BlocksStorageFlags())
33+
34+
flags := mergeFlags(baseFlags, map[string]string{
35+
// ooo setting
36+
"-ingester.enable-ooo-native-histograms": "true",
37+
"-blocks-storage.tsdb.enable-native-histograms": "true",
38+
"-ingester.out-of-order-time-window": "5m",
39+
// alert manager
40+
"-alertmanager.web.external-url": "http://localhost/alertmanager",
41+
// consul
42+
"-ring.store": "consul",
43+
"-consul.hostname": consul.NetworkHTTPEndpoint(),
44+
})
45+
46+
nowTs := time.Now()
47+
oooTs := time.Now().Add(-time.Minute * 3)
48+
tooOldTs := time.Now().Add(-time.Minute * 10)
49+
50+
// make alert manager config dir
51+
require.NoError(t, writeFileToSharedDir(s, "alertmanager_configs", []byte{}))
52+
53+
minio := e2edb.NewMinio(9000, flags["-blocks-storage.s3.bucket-name"])
54+
require.NoError(t, s.StartAndWaitReady(minio))
55+
56+
cortex := e2ecortex.NewSingleBinary("cortex", flags, "")
57+
require.NoError(t, s.StartAndWaitReady(cortex))
58+
59+
// Wait until Cortex replicas have updated the ring state.
60+
require.NoError(t, cortex.WaitSumMetrics(e2e.Equals(float64(512)), "cortex_ring_tokens_total"))
61+
62+
seriesName := "series"
63+
histogramIdx := rand.Uint32()
64+
65+
// Make Cortex client
66+
c, err := e2ecortex.NewClient(cortex.HTTPEndpoint(), cortex.HTTPEndpoint(), "", "", "user-1")
67+
require.NoError(t, err)
68+
69+
// Push now ts
70+
seriesNow := e2e.GenerateHistogramSeries(seriesName, nowTs, histogramIdx, false, prompb.Label{Name: "job", Value: "test"})
71+
res, err := c.Push(seriesNow)
72+
require.NoError(t, err)
73+
require.Equal(t, 200, res.StatusCode)
74+
75+
// Push ooo ts
76+
seriesOOOTs := e2e.GenerateHistogramSeries(seriesName, oooTs, histogramIdx, false, prompb.Label{Name: "job", Value: "test"})
77+
res, err = c.Push(seriesOOOTs)
78+
require.NoError(t, err)
79+
require.Equal(t, 200, res.StatusCode)
80+
81+
// Push too old ts
82+
seriesTooOOOTs := e2e.GenerateHistogramSeries(seriesName, tooOldTs, histogramIdx, false, prompb.Label{Name: "job", Value: "test"})
83+
res, err = c.Push(seriesTooOOOTs)
84+
require.NoError(t, err)
85+
require.Equal(t, 400, res.StatusCode)
86+
87+
// check metrics
88+
require.NoError(t, cortex.WaitSumMetrics(e2e.Equals(2), "cortex_ingester_tsdb_head_samples_appended_total"),
89+
e2e.WithLabelMatchers(labels.MustNewMatcher(labels.MatchEqual, "type", "histogram")),
90+
e2e.WithLabelMatchers(labels.MustNewMatcher(labels.MatchEqual, "user", "user-1")),
91+
)
92+
require.NoError(t, cortex.WaitSumMetrics(e2e.Equals(1), "cortex_ingester_tsdb_head_out_of_order_samples_appended_total"),
93+
e2e.WithLabelMatchers(labels.MustNewMatcher(labels.MatchEqual, "type", "histogram")),
94+
e2e.WithLabelMatchers(labels.MustNewMatcher(labels.MatchEqual, "user", "user-1")),
95+
)
96+
97+
require.NoError(t, cortex.WaitSumMetrics(e2e.Equals(2), "cortex_ingester_ingested_native_histograms_total"))
98+
require.NoError(t, cortex.WaitSumMetrics(e2e.Equals(1), "cortex_ingester_ingested_native_histograms_failures_total"))
99+
}
100+
22101
func TestNativeHistogramIngestionAndQuery(t *testing.T) {
23102
const blockRangePeriod = 5 * time.Second
24103

pkg/ingester/ingester.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1260,6 +1260,9 @@ func (i *Ingester) Push(ctx context.Context, req *cortexpb.WriteRequest) (*corte
12601260
case errors.Is(cause, histogram.ErrHistogramCountMismatch):
12611261
updateFirstPartial(func() error { return wrappedTSDBIngestErr(err, model.Time(timestampMs), lbls) })
12621262

1263+
case errors.Is(cause, storage.ErrOOONativeHistogramsDisabled):
1264+
updateFirstPartial(func() error { return wrappedTSDBIngestErr(err, model.Time(timestampMs), lbls) })
1265+
12631266
default:
12641267
rollback = true
12651268
}
@@ -2421,6 +2424,7 @@ func (i *Ingester) createTSDB(userID string) (*userTSDB, error) {
24212424
EnableMemorySnapshotOnShutdown: i.cfg.BlocksStorageConfig.TSDB.MemorySnapshotOnShutdown,
24222425
OutOfOrderTimeWindow: time.Duration(oooTimeWindow).Milliseconds(),
24232426
OutOfOrderCapMax: i.cfg.BlocksStorageConfig.TSDB.OutOfOrderCapMax,
2427+
EnableOOONativeHistograms: i.limits.EnableOOONativeHistograms(userID),
24242428
EnableOverlappingCompaction: false, // Always let compactors handle overlapped blocks, e.g. OOO blocks.
24252429
EnableNativeHistograms: i.cfg.BlocksStorageConfig.TSDB.EnableNativeHistograms,
24262430
BlockChunkQuerierFunc: i.blockChunkQuerierFunc(userID),

pkg/ingester/ingester_test.go

Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1028,6 +1028,7 @@ func TestIngester_Push(t *testing.T) {
10281028
maxExemplars int
10291029
oooTimeWindow time.Duration
10301030
disableNativeHistogram bool
1031+
enableOOONativeHistograms bool
10311032
}{
10321033
"should record native histogram discarded": {
10331034
reqs: []*cortexpb.WriteRequest{
@@ -1516,6 +1517,186 @@ func TestIngester_Push(t *testing.T) {
15161517
cortex_ingester_active_series{user="test"} 1
15171518
`,
15181519
},
1520+
"native histogram ooo disabled, should soft fail": {
1521+
reqs: []*cortexpb.WriteRequest{
1522+
cortexpb.ToWriteRequest(
1523+
[]labels.Labels{metricLabels},
1524+
nil,
1525+
nil,
1526+
[]cortexpb.Histogram{cortexpb.HistogramToHistogramProto(1575043969, tsdbutil.GenerateTestHistogram(1))},
1527+
cortexpb.API),
1528+
cortexpb.ToWriteRequest(
1529+
[]labels.Labels{metricLabels},
1530+
nil,
1531+
nil,
1532+
[]cortexpb.Histogram{cortexpb.HistogramToHistogramProto(1575043969-(10), tsdbutil.GenerateTestHistogram(1))},
1533+
cortexpb.API),
1534+
},
1535+
oooTimeWindow: 5 * time.Minute,
1536+
enableOOONativeHistograms: false,
1537+
expectedErr: httpgrpc.Errorf(http.StatusBadRequest, wrapWithUser(wrappedTSDBIngestErr(storage.ErrOOONativeHistogramsDisabled, model.Time(1575043969-(10)), cortexpb.FromLabelsToLabelAdapters(metricLabels)), userID).Error()),
1538+
expectedIngested: []cortexpb.TimeSeries{
1539+
{Labels: metricLabelAdapters, Histograms: []cortexpb.Histogram{cortexpb.HistogramToHistogramProto(1575043969, tsdbutil.GenerateTestHistogram(1))}},
1540+
},
1541+
additionalMetrics: []string{
1542+
"cortex_ingester_tsdb_head_samples_appended_total",
1543+
"cortex_ingester_active_series",
1544+
},
1545+
expectedMetrics: `
1546+
# HELP cortex_ingester_ingested_samples_failures_total The total number of samples that errored on ingestion.
1547+
# TYPE cortex_ingester_ingested_samples_failures_total counter
1548+
cortex_ingester_ingested_samples_failures_total 0
1549+
# HELP cortex_ingester_ingested_samples_total The total number of samples ingested.
1550+
# TYPE cortex_ingester_ingested_samples_total counter
1551+
cortex_ingester_ingested_samples_total 0
1552+
# HELP cortex_ingester_ingested_native_histograms_total The total number of native histograms ingested.
1553+
# TYPE cortex_ingester_ingested_native_histograms_total counter
1554+
cortex_ingester_ingested_native_histograms_total 1
1555+
# HELP cortex_ingester_ingested_native_histograms_failures_total The total number of native histograms that errored on ingestion.
1556+
# TYPE cortex_ingester_ingested_native_histograms_failures_total counter
1557+
cortex_ingester_ingested_native_histograms_failures_total 1
1558+
# HELP cortex_ingester_memory_users The current number of users in memory.
1559+
# TYPE cortex_ingester_memory_users gauge
1560+
cortex_ingester_memory_users 1
1561+
# HELP cortex_ingester_tsdb_head_samples_appended_total Total number of appended samples.
1562+
# TYPE cortex_ingester_tsdb_head_samples_appended_total counter
1563+
cortex_ingester_tsdb_head_samples_appended_total{type="float",user="test"} 0
1564+
cortex_ingester_tsdb_head_samples_appended_total{type="histogram",user="test"} 1
1565+
# HELP cortex_ingester_memory_series The current number of series in memory.
1566+
# TYPE cortex_ingester_memory_series gauge
1567+
cortex_ingester_memory_series 1
1568+
# HELP cortex_ingester_memory_series_created_total The total number of series that were created per user.
1569+
# TYPE cortex_ingester_memory_series_created_total counter
1570+
cortex_ingester_memory_series_created_total{user="test"} 1
1571+
# HELP cortex_ingester_memory_series_removed_total The total number of series that were removed per user.
1572+
# TYPE cortex_ingester_memory_series_removed_total counter
1573+
cortex_ingester_memory_series_removed_total{user="test"} 0
1574+
# HELP cortex_ingester_active_series Number of currently active series per user.
1575+
# TYPE cortex_ingester_active_series gauge
1576+
cortex_ingester_active_series{user="test"} 1
1577+
`,
1578+
},
1579+
"native histogram ooo enabled, should soft fail on sample too old": {
1580+
reqs: []*cortexpb.WriteRequest{
1581+
cortexpb.ToWriteRequest(
1582+
[]labels.Labels{metricLabels},
1583+
nil,
1584+
nil,
1585+
[]cortexpb.Histogram{cortexpb.HistogramToHistogramProto(1575043969, tsdbutil.GenerateTestHistogram(1))},
1586+
cortexpb.API),
1587+
cortexpb.ToWriteRequest(
1588+
[]labels.Labels{metricLabels},
1589+
nil,
1590+
nil,
1591+
[]cortexpb.Histogram{cortexpb.HistogramToHistogramProto(1575043969-(600*1000), tsdbutil.GenerateTestHistogram(1))},
1592+
cortexpb.API),
1593+
},
1594+
oooTimeWindow: 5 * time.Minute,
1595+
enableOOONativeHistograms: true,
1596+
expectedErr: httpgrpc.Errorf(http.StatusBadRequest, wrapWithUser(wrappedTSDBIngestErr(storage.ErrTooOldSample, model.Time(1575043969-(600*1000)), cortexpb.FromLabelsToLabelAdapters(metricLabels)), userID).Error()),
1597+
expectedIngested: []cortexpb.TimeSeries{
1598+
{Labels: metricLabelAdapters, Histograms: []cortexpb.Histogram{cortexpb.HistogramToHistogramProto(1575043969, tsdbutil.GenerateTestHistogram(1))}},
1599+
},
1600+
additionalMetrics: []string{
1601+
"cortex_ingester_tsdb_head_samples_appended_total",
1602+
"cortex_ingester_active_series",
1603+
"cortex_discarded_samples_total",
1604+
},
1605+
expectedMetrics: `
1606+
# HELP cortex_ingester_ingested_samples_failures_total The total number of samples that errored on ingestion.
1607+
# TYPE cortex_ingester_ingested_samples_failures_total counter
1608+
cortex_ingester_ingested_samples_failures_total 0
1609+
# HELP cortex_ingester_ingested_samples_total The total number of samples ingested.
1610+
# TYPE cortex_ingester_ingested_samples_total counter
1611+
cortex_ingester_ingested_samples_total 0
1612+
# HELP cortex_ingester_ingested_native_histograms_total The total number of native histograms ingested.
1613+
# TYPE cortex_ingester_ingested_native_histograms_total counter
1614+
cortex_ingester_ingested_native_histograms_total 1
1615+
# HELP cortex_ingester_ingested_native_histograms_failures_total The total number of native histograms that errored on ingestion.
1616+
# TYPE cortex_ingester_ingested_native_histograms_failures_total counter
1617+
cortex_ingester_ingested_native_histograms_failures_total 1
1618+
# HELP cortex_ingester_memory_users The current number of users in memory.
1619+
# TYPE cortex_ingester_memory_users gauge
1620+
cortex_ingester_memory_users 1
1621+
# HELP cortex_ingester_tsdb_head_samples_appended_total Total number of appended samples.
1622+
# TYPE cortex_ingester_tsdb_head_samples_appended_total counter
1623+
cortex_ingester_tsdb_head_samples_appended_total{type="float",user="test"} 0
1624+
cortex_ingester_tsdb_head_samples_appended_total{type="histogram",user="test"} 1
1625+
# HELP cortex_ingester_memory_series The current number of series in memory.
1626+
# TYPE cortex_ingester_memory_series gauge
1627+
cortex_ingester_memory_series 1
1628+
# HELP cortex_ingester_memory_series_created_total The total number of series that were created per user.
1629+
# TYPE cortex_ingester_memory_series_created_total counter
1630+
cortex_ingester_memory_series_created_total{user="test"} 1
1631+
# HELP cortex_ingester_memory_series_removed_total The total number of series that were removed per user.
1632+
# TYPE cortex_ingester_memory_series_removed_total counter
1633+
cortex_ingester_memory_series_removed_total{user="test"} 0
1634+
# HELP cortex_ingester_active_series Number of currently active series per user.
1635+
# TYPE cortex_ingester_active_series gauge
1636+
cortex_ingester_active_series{user="test"} 1
1637+
# HELP cortex_discarded_samples_total The total number of samples that were discarded.
1638+
# TYPE cortex_discarded_samples_total counter
1639+
cortex_discarded_samples_total{reason="sample-too-old",user="test"} 1
1640+
`,
1641+
},
1642+
"native histogram ooo enabled, should succeed": {
1643+
reqs: []*cortexpb.WriteRequest{
1644+
cortexpb.ToWriteRequest(
1645+
[]labels.Labels{metricLabels},
1646+
nil,
1647+
nil,
1648+
[]cortexpb.Histogram{cortexpb.HistogramToHistogramProto(1575043969, tsdbutil.GenerateTestHistogram(1))},
1649+
cortexpb.API),
1650+
cortexpb.ToWriteRequest(
1651+
[]labels.Labels{metricLabels},
1652+
nil,
1653+
nil,
1654+
[]cortexpb.Histogram{cortexpb.HistogramToHistogramProto(1575043969-(10), tsdbutil.GenerateTestHistogram(1))},
1655+
cortexpb.API),
1656+
},
1657+
oooTimeWindow: 5 * time.Minute,
1658+
enableOOONativeHistograms: true,
1659+
expectedIngested: []cortexpb.TimeSeries{
1660+
{Labels: metricLabelAdapters, Histograms: []cortexpb.Histogram{cortexpb.HistogramToHistogramProto(1575043969-(10), tsdbutil.GenerateTestHistogram(1)), cortexpb.HistogramToHistogramProto(1575043969, tsdbutil.GenerateTestHistogram(1))}},
1661+
},
1662+
additionalMetrics: []string{
1663+
"cortex_ingester_tsdb_head_samples_appended_total",
1664+
"cortex_ingester_active_series",
1665+
},
1666+
expectedMetrics: `
1667+
# HELP cortex_ingester_ingested_samples_failures_total The total number of samples that errored on ingestion.
1668+
# TYPE cortex_ingester_ingested_samples_failures_total counter
1669+
cortex_ingester_ingested_samples_failures_total 0
1670+
# HELP cortex_ingester_ingested_samples_total The total number of samples ingested.
1671+
# TYPE cortex_ingester_ingested_samples_total counter
1672+
cortex_ingester_ingested_samples_total 0
1673+
# HELP cortex_ingester_ingested_native_histograms_total The total number of native histograms ingested.
1674+
# TYPE cortex_ingester_ingested_native_histograms_total counter
1675+
cortex_ingester_ingested_native_histograms_total 2
1676+
# HELP cortex_ingester_ingested_native_histograms_failures_total The total number of native histograms that errored on ingestion.
1677+
# TYPE cortex_ingester_ingested_native_histograms_failures_total counter
1678+
cortex_ingester_ingested_native_histograms_failures_total 0
1679+
# HELP cortex_ingester_memory_users The current number of users in memory.
1680+
# TYPE cortex_ingester_memory_users gauge
1681+
cortex_ingester_memory_users 1
1682+
# HELP cortex_ingester_tsdb_head_samples_appended_total Total number of appended samples.
1683+
# TYPE cortex_ingester_tsdb_head_samples_appended_total counter
1684+
cortex_ingester_tsdb_head_samples_appended_total{type="float",user="test"} 0
1685+
cortex_ingester_tsdb_head_samples_appended_total{type="histogram",user="test"} 2
1686+
# HELP cortex_ingester_memory_series The current number of series in memory.
1687+
# TYPE cortex_ingester_memory_series gauge
1688+
cortex_ingester_memory_series 1
1689+
# HELP cortex_ingester_memory_series_created_total The total number of series that were created per user.
1690+
# TYPE cortex_ingester_memory_series_created_total counter
1691+
cortex_ingester_memory_series_created_total{user="test"} 1
1692+
# HELP cortex_ingester_memory_series_removed_total The total number of series that were removed per user.
1693+
# TYPE cortex_ingester_memory_series_removed_total counter
1694+
cortex_ingester_memory_series_removed_total{user="test"} 0
1695+
# HELP cortex_ingester_active_series Number of currently active series per user.
1696+
# TYPE cortex_ingester_active_series gauge
1697+
cortex_ingester_active_series{user="test"} 1
1698+
`,
1699+
},
15191700
"should soft fail on two different sample values at the same timestamp": {
15201701
reqs: []*cortexpb.WriteRequest{
15211702
cortexpb.ToWriteRequest(
@@ -1824,6 +2005,7 @@ func TestIngester_Push(t *testing.T) {
18242005
limits := defaultLimitsTestConfig()
18252006
limits.MaxExemplars = testData.maxExemplars
18262007
limits.OutOfOrderTimeWindow = model.Duration(testData.oooTimeWindow)
2008+
limits.EnableOOONativeHistograms = testData.enableOOONativeHistograms
18272009
limits.LimitsPerLabelSet = []validation.LimitsPerLabelSet{
18282010
{
18292011
LabelSet: labels.FromMap(map[string]string{model.MetricNameLabel: "test"}),

0 commit comments

Comments
 (0)