Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement stream connection for remote write #6580

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/configuration/config-file-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -2679,6 +2679,11 @@ ha_tracker:
# CLI flag: -distributor.sign-write-requests
[sign_write_requests: <boolean> | default = false]

# EXPERIMENTAL: If enabled, distributor would use stream connection to send
# requests to ingesters.
# CLI flag: -distributor.use-stream-push
[use_stream_push: <boolean> | default = false]

ring:
kvstore:
# Backend storage to use for the ring. Supported values are: consul, etcd,
Expand Down
499 changes: 392 additions & 107 deletions pkg/cortexpb/cortex.pb.go

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions pkg/cortexpb/cortex.proto
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ message WriteRequest {
bool skip_label_name_validation = 1000; //set intentionally high to keep WriteRequest compatible with upstream Prometheus
}

// StreamWriteRequest bundles a WriteRequest with the tenant it belongs to,
// for use on the distributor -> ingester stream push connection.
// NOTE(review): the tenant is carried in-band in the message — presumably
// because a shared long-lived stream cannot rely on per-request gRPC
// metadata; confirm with the stream client implementation.
// NOTE(review): the proto style guide prefers lower_snake_case field names
// (tenant_id, request). Renaming keeps the wire format compatible (field
// numbers unchanged) but changes the generated Go identifiers, so it must
// be coordinated with regenerating cortex.pb.go and its callers.
message StreamWriteRequest {
string TenantID = 1;
WriteRequest Request = 2;
}

message WriteResponse {}

message TimeSeries {
Expand Down
35 changes: 23 additions & 12 deletions pkg/distributor/distributor.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import (
"context"
"flag"
"fmt"
io "io"
"io"
"net/http"
"sort"
"strings"
Expand Down Expand Up @@ -150,6 +150,7 @@ type Config struct {
ShardByAllLabels bool `yaml:"shard_by_all_labels"`
ExtendWrites bool `yaml:"extend_writes"`
SignWriteRequestsEnabled bool `yaml:"sign_write_requests"`
UseStreamPush bool `yaml:"use_stream_push"`

// Distributors ring
DistributorRing RingConfig `yaml:"ring"`
Expand Down Expand Up @@ -204,6 +205,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
f.DurationVar(&cfg.ExtraQueryDelay, "distributor.extra-query-delay", 0, "Time to wait before sending more than the minimum successful query requests.")
f.BoolVar(&cfg.ShardByAllLabels, "distributor.shard-by-all-labels", false, "Distribute samples based on all labels, as opposed to solely by user and metric name.")
f.BoolVar(&cfg.SignWriteRequestsEnabled, "distributor.sign-write-requests", false, "EXPERIMENTAL: If enabled, sign the write request between distributors and ingesters.")
f.BoolVar(&cfg.UseStreamPush, "distributor.use-stream-push", false, "EXPERIMENTAL: If enabled, distributor would use stream connection to send requests to ingesters.")
f.StringVar(&cfg.ShardingStrategy, "distributor.sharding-strategy", util.ShardingStrategyDefault, fmt.Sprintf("The sharding strategy to use. Supported values are: %s.", strings.Join(supportedShardingStrategies, ", ")))
f.BoolVar(&cfg.ExtendWrites, "distributor.extend-writes", true, "Try writing to an additional ingester in the presence of an ingester not in the ACTIVE state. It is useful to disable this along with -ingester.unregister-on-shutdown=false in order to not spread samples to extra ingesters during rolling restarts with consistent naming.")
f.BoolVar(&cfg.ZoneResultsQuorumMetadata, "distributor.zone-results-quorum-metadata", false, "Experimental, this flag may change in the future. If zone awareness and this both enabled, when querying metadata APIs (labels names and values for now), only results from quorum number of zones will be included.")
Expand Down Expand Up @@ -242,7 +244,7 @@ const (
func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Overrides, ingestersRing ring.ReadRing, canJoinDistributorsRing bool, reg prometheus.Registerer, log log.Logger) (*Distributor, error) {
if cfg.IngesterClientFactory == nil {
cfg.IngesterClientFactory = func(addr string) (ring_client.PoolClient, error) {
return ingester_client.MakeIngesterClient(addr, clientConfig)
return ingester_client.MakeIngesterClient(addr, clientConfig, cfg.UseStreamPush)
}
}

Expand Down Expand Up @@ -1134,20 +1136,29 @@ func (d *Distributor) send(ctx context.Context, ingester ring.InstanceDesc, time

c := h.(ingester_client.HealthAndIngesterClient)

req := cortexpb.PreallocWriteRequestFromPool()
req.Timeseries = timeseries
req.Metadata = metadata
req.Source = source

d.inflightClientRequests.Inc()
defer d.inflightClientRequests.Dec()

_, err = c.PushPreAlloc(ctx, req)
if d.cfg.UseStreamPush {
req := &cortexpb.WriteRequest{
Timeseries: timeseries,
Metadata: metadata,
Source: source,
}
_, err = c.PushStreamConnection(ctx, req)
} else {
req := cortexpb.PreallocWriteRequestFromPool()
req.Timeseries = timeseries
req.Metadata = metadata
req.Source = source

// We should not reuse the req in case of errors:
// See: https://github.com/grpc/grpc-go/issues/6355
if err == nil {
cortexpb.ReuseWriteRequest(req)
_, err = c.PushPreAlloc(ctx, req)

// We should not reuse the req in case of errors:
// See: https://github.com/grpc/grpc-go/issues/6355
if err == nil {
cortexpb.ReuseWriteRequest(req)
}
}

if len(metadata) > 0 {
Expand Down
91 changes: 53 additions & 38 deletions pkg/distributor/distributor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -351,46 +351,50 @@ func TestDistributor_Push(t *testing.T) {
`,
},
} {
for _, shardByAllLabels := range []bool{true, false} {
tc := tc
name := name
shardByAllLabels := shardByAllLabels
t.Run(fmt.Sprintf("[%s](shardByAllLabels=%v)", name, shardByAllLabels), func(t *testing.T) {
t.Parallel()
limits := &validation.Limits{}
flagext.DefaultValues(limits)
limits.IngestionRate = 20
limits.IngestionBurstSize = 20
for _, useStreamPush := range []bool{false, true} {
for _, shardByAllLabels := range []bool{true, false} {
tc := tc
name := name
shardByAllLabels := shardByAllLabels
useStreamPush := useStreamPush
t.Run(fmt.Sprintf("[%s](shardByAllLabels=%v,useStreamPush=%v)", name, shardByAllLabels, useStreamPush), func(t *testing.T) {
t.Parallel()
limits := &validation.Limits{}
flagext.DefaultValues(limits)
limits.IngestionRate = 20
limits.IngestionBurstSize = 20

ds, _, regs, _ := prepare(t, prepConfig{
numIngesters: tc.numIngesters,
happyIngesters: tc.happyIngesters,
numDistributors: 1,
shardByAllLabels: shardByAllLabels,
limits: limits,
errFail: tc.ingesterError,
useStreamPush: useStreamPush,
})

ds, _, regs, _ := prepare(t, prepConfig{
numIngesters: tc.numIngesters,
happyIngesters: tc.happyIngesters,
numDistributors: 1,
shardByAllLabels: shardByAllLabels,
limits: limits,
errFail: tc.ingesterError,
var request *cortexpb.WriteRequest
if !tc.histogramSamples {
request = makeWriteRequest(tc.samples.startTimestampMs, tc.samples.num, tc.metadata, 0)
} else {
request = makeWriteRequest(tc.samples.startTimestampMs, 0, tc.metadata, tc.samples.num)
}
response, err := ds[0].Push(ctx, request)
assert.Equal(t, tc.expectedResponse, response)
assert.Equal(t, status.Code(tc.expectedError), status.Code(err))

// Check tracked Prometheus metrics. Since the Push() response is sent as soon as the quorum
// is reached, when we reach this point the 3rd ingester may not have received series/metadata
// yet. To avoid flaky test we retry metrics assertion until we hit the desired state (no error)
// within a reasonable timeout.
if tc.expectedMetrics != "" {
test.Poll(t, time.Second, nil, func() interface{} {
return testutil.GatherAndCompare(regs[0], strings.NewReader(tc.expectedMetrics), tc.metricNames...)
})
}
})

var request *cortexpb.WriteRequest
if !tc.histogramSamples {
request = makeWriteRequest(tc.samples.startTimestampMs, tc.samples.num, tc.metadata, 0)
} else {
request = makeWriteRequest(tc.samples.startTimestampMs, 0, tc.metadata, tc.samples.num)
}
response, err := ds[0].Push(ctx, request)
assert.Equal(t, tc.expectedResponse, response)
assert.Equal(t, status.Code(tc.expectedError), status.Code(err))

// Check tracked Prometheus metrics. Since the Push() response is sent as soon as the quorum
// is reached, when we reach this point the 3rd ingester may not have received series/metadata
// yet. To avoid flaky test we retry metrics assertion until we hit the desired state (no error)
// within a reasonable timeout.
if tc.expectedMetrics != "" {
test.Poll(t, time.Second, nil, func() interface{} {
return testutil.GatherAndCompare(regs[0], strings.NewReader(tc.expectedMetrics), tc.metricNames...)
})
}
})
}
}
}
}
Expand Down Expand Up @@ -2340,6 +2344,7 @@ func BenchmarkDistributor_Push(b *testing.B) {
distributorCfg.IngesterClientFactory = func(addr string) (ring_client.PoolClient, error) {
return &noopIngester{}, nil
}
distributorCfg.UseStreamPush = false

overrides, err := validation.NewOverrides(limits, nil)
require.NoError(b, err)
Expand Down Expand Up @@ -2836,6 +2841,7 @@ type prepConfig struct {
enableTracker bool
errFail error
tokens [][]uint32
useStreamPush bool
}

type prepState struct {
Expand Down Expand Up @@ -2950,6 +2956,7 @@ func prepare(tb testing.TB, cfg prepConfig) ([]*Distributor, []*mockIngester, []
distributorCfg.InstanceLimits.MaxInflightPushRequests = cfg.maxInflightRequests
distributorCfg.InstanceLimits.MaxInflightClientRequests = cfg.maxInflightClientRequests
distributorCfg.InstanceLimits.MaxIngestionRate = cfg.maxIngestionRate
distributorCfg.UseStreamPush = cfg.useStreamPush

if cfg.shuffleShardEnabled {
distributorCfg.ShardingStrategy = util.ShardingStrategyShuffle
Expand Down Expand Up @@ -3307,6 +3314,10 @@ func (i *mockIngester) PushPreAlloc(ctx context.Context, in *cortexpb.PreallocWr
return i.Push(ctx, &in.WriteRequest, opts...)
}

// PushStreamConnection implements the stream-push variant of the client
// interface by delegating to the regular Push handler, so the mock records
// stream-pushed series exactly like non-stream ones.
func (i *mockIngester) PushStreamConnection(ctx context.Context, req *cortexpb.WriteRequest, opts ...grpc.CallOption) (*cortexpb.WriteResponse, error) {
	return i.Push(ctx, req, opts...)
}

func (i *mockIngester) Push(ctx context.Context, req *cortexpb.WriteRequest, opts ...grpc.CallOption) (*cortexpb.WriteResponse, error) {
i.Lock()
defer i.Unlock()
Expand Down Expand Up @@ -3561,6 +3572,10 @@ func (i *noopIngester) Push(ctx context.Context, req *cortexpb.WriteRequest, opt
return nil, nil
}

// PushStreamConnection is a no-op: it discards the request and reports
// success, mirroring this type's Push method (used for benchmarks where
// ingester work must not be measured).
func (i *noopIngester) PushStreamConnection(ctx context.Context, req *cortexpb.WriteRequest, opts ...grpc.CallOption) (*cortexpb.WriteResponse, error) {
	return nil, nil
}

type queryStream struct {
grpc.ClientStream
i int
Expand Down
Loading
Loading