Introduce a regex tenant resolver #6713

Status: Open. Wants to merge 1 commit into base: master.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -9,6 +9,9 @@
* [FEATURE] Ingester: Support out-of-order native histogram ingestion. It is automatically enabled when `-ingester.out-of-order-time-window > 0` and `-blocks-storage.tsdb.enable-native-histograms=true`. #6626 #6663
* [FEATURE] Ruler: Add support for percentage based sharding for rulers. #6680
* [FEATURE] Ruler: Add support for group labels. #6665
* [FEATURE] Query federation: Introduce a regex tenant resolver to allow regular expressions in the `X-Scope-OrgID` header value. #6713
  - Add a `tenant-federation.regex-matcher-enabled` flag. When enabled, the `X-Scope-OrgID` value can be a regex, and all matching tenant IDs are automatically included.
  - Add a `tenant-federation.user-sync-interval` flag that specifies how frequently to scan users. The scanned users are used to compute the matched tenant IDs.
* [ENHANCEMENT] Query Frontend: Return 400 when tenant resolving fails. #6715
* [ENHANCEMENT] Querier: Support query parameters on the metadata API (`/api/v1/metadata`) to allow users to limit the metadata returned. #6681
* [ENHANCEMENT] Ingester: Add a `cortex_ingester_active_native_histogram_series` metric to track # of active NH series. #6695
12 changes: 12 additions & 0 deletions docs/configuration/config-file-reference.md
@@ -171,6 +171,18 @@ tenant_federation:
# CLI flag: -tenant-federation.max-tenant
[max_tenant: <int> | default = 0]

# [Experimental] If enabled, the `X-Scope-OrgID` header value can be a regex
# and all tenant IDs matching it are automatically included. The regex
# matching rules follow Prometheus; see
# https://prometheus.io/docs/prometheus/latest/querying/basics/#regular-expressions.
# CLI flag: -tenant-federation.regex-matcher-enabled
[regex_matcher_enabled: <boolean> | default = false]

# If the regex matcher is enabled, specifies how frequently to scan users. The
# scanned users are used to compute the matched tenant IDs.
# CLI flag: -tenant-federation.user-sync-interval
[user_sync_interval: <duration> | default = 5m]

# The ruler_config configures the Cortex ruler.
[ruler: <ruler_config>]

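To make the new options concrete, here is a minimal, hypothetical sketch of a federated query that relies on `-tenant-federation.regex-matcher-enabled`: the client sends a regex in `X-Scope-OrgID`, and every known tenant matching it is queried. The endpoint, port, and API prefix are assumptions about a local deployment, not something defined by this PR.

```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"net/url"
)

func main() {
	// Assumed query-frontend address and Prometheus HTTP API prefix; adjust to
	// your own deployment.
	endpoint := "http://localhost:8080/prometheus/api/v1/query"

	params := url.Values{}
	params.Set("query", "up")

	req, err := http.NewRequest(http.MethodGet, endpoint+"?"+params.Encode(), nil)
	if err != nil {
		panic(err)
	}
	// With -tenant-federation.enabled=true and
	// -tenant-federation.regex-matcher-enabled=true, the header value may be a
	// regex; all known tenant IDs matching it are included in the query.
	req.Header.Set("X-Scope-OrgID", "user-.+")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	body, _ := io.ReadAll(resp.Body)
	fmt.Println(resp.Status, string(body))
}
```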
4 changes: 3 additions & 1 deletion docs/configuration/v1-guarantees.md
@@ -64,7 +64,9 @@ Currently experimental features are:
- Blocks storage bucket index
- The bucket index support in the querier and store-gateway (enabled via `-blocks-storage.bucket-store.bucket-index.enabled=true`) is experimental
- The block deletion marks migration support in the compactor (`-compactor.block-deletion-marks-migration-enabled`) is temporary and will be removed in future versions
- Querier: tenant federation
- Querier:
- Tenant federation (`-tenant-federation.enabled`)
  - The regex matcher for tenant federation (`-tenant-federation.regex-matcher-enabled`)
- The thanosconvert tool for converting Thanos block metadata to Cortex
- HA Tracker: cleanup of old replicas from KV Store.
- Instance limits in ingester and distributor
139 changes: 139 additions & 0 deletions integration/querier_tenant_federation_test.go
@@ -28,25 +28,164 @@ type querierTenantFederationConfig struct {

func TestQuerierTenantFederation(t *testing.T) {
runQuerierTenantFederationTest(t, querierTenantFederationConfig{})
runQuerierTenantFederationTest_UseRegexResolver(t, querierTenantFederationConfig{})
}

func TestQuerierTenantFederationWithQueryScheduler(t *testing.T) {
runQuerierTenantFederationTest(t, querierTenantFederationConfig{
querySchedulerEnabled: true,
})
runQuerierTenantFederationTest_UseRegexResolver(t, querierTenantFederationConfig{
querySchedulerEnabled: true,
})
}

func TestQuerierTenantFederationWithShuffleSharding(t *testing.T) {
runQuerierTenantFederationTest(t, querierTenantFederationConfig{
shuffleShardingEnabled: true,
})
runQuerierTenantFederationTest_UseRegexResolver(t, querierTenantFederationConfig{
shuffleShardingEnabled: true,
})
}

func TestQuerierTenantFederationWithQuerySchedulerAndShuffleSharding(t *testing.T) {
runQuerierTenantFederationTest(t, querierTenantFederationConfig{
querySchedulerEnabled: true,
shuffleShardingEnabled: true,
})
runQuerierTenantFederationTest_UseRegexResolver(t, querierTenantFederationConfig{
querySchedulerEnabled: true,
shuffleShardingEnabled: true,
})
}

func runQuerierTenantFederationTest_UseRegexResolver(t *testing.T, cfg querierTenantFederationConfig) {
const numUsers = 10

s, err := e2e.NewScenario(networkName)
require.NoError(t, err)
defer s.Close()

memcached := e2ecache.NewMemcached()
consul := e2edb.NewConsul()
require.NoError(t, s.StartAndWaitReady(consul, memcached))

flags := mergeFlags(BlocksStorageFlags(), map[string]string{
"-querier.cache-results": "true",
"-querier.split-queries-by-interval": "24h",
"-querier.query-ingesters-within": "12h", // Required by the test on query /series out of ingesters time range
"-frontend.memcached.addresses": "dns+" + memcached.NetworkEndpoint(e2ecache.MemcachedPort),
"-tenant-federation.enabled": "true",
"-tenant-federation.regex-matcher-enabled": "true",
"-tenant-federation.user-sync-interval": "1s",
})

// Start the query-scheduler if enabled.
var queryScheduler *e2ecortex.CortexService
if cfg.querySchedulerEnabled {
queryScheduler = e2ecortex.NewQueryScheduler("query-scheduler", flags, "")
require.NoError(t, s.StartAndWaitReady(queryScheduler))
flags["-frontend.scheduler-address"] = queryScheduler.NetworkGRPCEndpoint()
flags["-querier.scheduler-address"] = queryScheduler.NetworkGRPCEndpoint()
}

if cfg.shuffleShardingEnabled {
// Use only single querier for each user.
flags["-frontend.max-queriers-per-tenant"] = "1"
}

minio := e2edb.NewMinio(9000, flags["-blocks-storage.s3.bucket-name"])
require.NoError(t, s.StartAndWaitReady(minio))

// Start ingester and distributor.
ingester := e2ecortex.NewIngester("ingester", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), flags, "")
distributor := e2ecortex.NewDistributor("distributor", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), flags, "")
require.NoError(t, s.StartAndWaitReady(ingester, distributor))

// Wait until the distributor has updated the ring.
require.NoError(t, distributor.WaitSumMetrics(e2e.Equals(512), "cortex_ring_tokens_total"))

// Push a series for each user to Cortex.
now := time.Now()
expectedVectors := make([]model.Vector, numUsers)
tenantIDs := make([]string, numUsers)

for u := 0; u < numUsers; u++ {
tenantIDs[u] = fmt.Sprintf("user-%d", u)
c, err := e2ecortex.NewClient(distributor.HTTPEndpoint(), "", "", "", tenantIDs[u])
require.NoError(t, err)

var series []prompb.TimeSeries
series, expectedVectors[u] = generateSeries("series_1", now)

res, err := c.Push(series)
require.NoError(t, err)
require.Equal(t, 200, res.StatusCode)
}

// Start the query-frontend.
queryFrontend := e2ecortex.NewQueryFrontend("query-frontend", flags, "")
require.NoError(t, s.Start(queryFrontend))

if !cfg.querySchedulerEnabled {
flags["-querier.frontend-address"] = queryFrontend.NetworkGRPCEndpoint()
}
querier := e2ecortex.NewQuerier("querier", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), flags, "")

var querier2 *e2ecortex.CortexService
if cfg.shuffleShardingEnabled {
querier2 = e2ecortex.NewQuerier("querier-2", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), flags, "")
}

// Start queriers.
require.NoError(t, s.StartAndWaitReady(querier))
require.NoError(t, s.WaitReady(queryFrontend))
if cfg.shuffleShardingEnabled {
require.NoError(t, s.StartAndWaitReady(querier2))
}

require.NoError(t, querier.WaitSumMetrics(e2e.Equals(512), "cortex_ring_tokens_total"))
if cfg.shuffleShardingEnabled {
require.NoError(t, querier2.WaitSumMetrics(e2e.Equals(512), "cortex_ring_tokens_total"))
}

// Wait for the regex resolver to update its knownUsers list.
require.NoError(t, querier.WaitSumMetricsWithOptions(e2e.Greater(0), []string{"cortex_regex_resolver_last_update_run_timestamp_seconds"}, e2e.WaitMissingMetrics))
if cfg.shuffleShardingEnabled {
require.NoError(t, querier2.WaitSumMetricsWithOptions(e2e.Greater(0), []string{"cortex_regex_resolver_last_update_run_timestamp_seconds"}, e2e.WaitMissingMetrics))
}

// query all tenants
c, err := e2ecortex.NewClient(distributor.HTTPEndpoint(), queryFrontend.HTTPEndpoint(), "", "", "user-.+")
require.NoError(t, err)

result, err := c.Query("series_1", now)
require.NoError(t, err)

assert.Equal(t, mergeResults(tenantIDs, expectedVectors), result.(model.Vector))

// Ensure a push using a regex (multi-tenant) org ID fails.
series, _ := generateSeries("series_1", now)
res, err := c.Push(series)
require.NoError(t, err)

require.Equal(t, 500, res.StatusCode)

// check metric label values for total queries in the query frontend
require.NoError(t, queryFrontend.WaitSumMetricsWithOptions(e2e.Equals(1), []string{"cortex_query_frontend_queries_total"}, e2e.WithLabelMatchers(
labels.MustNewMatcher(labels.MatchEqual, "user", "user-.+"),
labels.MustNewMatcher(labels.MatchEqual, "op", "query"))))

// check metric label values for query queue length in either query frontend or query scheduler
queueComponent := queryFrontend
queueMetricName := "cortex_query_frontend_queue_length"
if cfg.querySchedulerEnabled {
queueComponent = queryScheduler
queueMetricName = "cortex_query_scheduler_queue_length"
}
require.NoError(t, queueComponent.WaitSumMetricsWithOptions(e2e.Equals(0), []string{queueMetricName}, e2e.WithLabelMatchers(
labels.MustNewMatcher(labels.MatchEqual, "user", "user-.+"))))
}

func runQuerierTenantFederationTest(t *testing.T, cfg querierTenantFederationConfig) {
19 changes: 19 additions & 0 deletions pkg/cortex/modules.go
@@ -50,6 +50,7 @@ import (
"github.com/cortexproject/cortex/pkg/scheduler"
"github.com/cortexproject/cortex/pkg/storage/bucket"
"github.com/cortexproject/cortex/pkg/storegateway"
"github.com/cortexproject/cortex/pkg/tenant"
"github.com/cortexproject/cortex/pkg/util/grpcclient"
util_log "github.com/cortexproject/cortex/pkg/util/log"
"github.com/cortexproject/cortex/pkg/util/modules"
@@ -280,6 +281,14 @@ func (t *Cortex) initTenantFederation() (serv services.Service, err error) {
// single tenant. This allows for a less impactful enabling of tenant
// federation.
byPassForSingleQuerier := true
if t.Cfg.TenantFederation.RegexMatcherEnabled {
util_log.WarnExperimentalUse("tenant-federation.regex-matcher-enabled")
// If the regex matcher is enabled, byPassForSingleQuerier must be false:
// even when only one tenant ID matches, the `X-Scope-OrgID` header still
// holds the input regex rather than a concrete tenant ID.
byPassForSingleQuerier = false
tenant.WithDefaultResolver(tenantfederation.NewRegexResolver(prometheus.DefaultRegisterer, t.Cfg.TenantFederation.UserSyncInterval, util_log.Logger, t.Distributor.AllUserStats))
Review thread:

Contributor:
I wonder if there is a better way to gather all users. Calling `t.Distributor.AllUserStats` seems a bit expensive just to get user IDs.
And it cannot cover users that no longer ingest but whose data may still be present in long-term storage.

SungJin1212 (Member, Author) replied on Apr 25, 2025:
> And it cannot cover users that no longer ingest but whose data may still be present in long-term storage.
Thanks for catching this. How about utilizing the userScanner?

SungJin1212 (Member, Author) replied on Apr 28, 2025:
> Calling t.Distributor.AllUserStats seems a bit expensive just to get user IDs.
Do you have any good ideas?
}
t.QuerierQueryable = querier.NewSampleAndChunkQueryable(tenantfederation.NewQueryable(t.QuerierQueryable, t.Cfg.TenantFederation.MaxConcurrent, byPassForSingleQuerier, prometheus.DefaultRegisterer))
t.MetadataQuerier = tenantfederation.NewMetadataQuerier(t.MetadataQuerier, t.Cfg.TenantFederation.MaxConcurrent, prometheus.DefaultRegisterer)
t.ExemplarQueryable = tenantfederation.NewExemplarQueryable(t.ExemplarQueryable, t.Cfg.TenantFederation.MaxConcurrent, byPassForSingleQuerier, prometheus.DefaultRegisterer)
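To illustrate the core idea of the resolver above (this is a sketch, not the PR's actual `RegexResolver`), the snippet below matches a regex `X-Scope-OrgID` value against a list of known users, the way the resolver would after each `user-sync-interval` refresh. The helper name and the hard-coded user list are illustrative; the anchoring mimics Prometheus' fully anchored regex matching.

```go
package main

import (
	"fmt"
	"regexp"
)

// matchTenantIDs returns every known tenant ID matched by orgIDRegex. The
// expression is fully anchored, mimicking Prometheus regex-matching semantics.
func matchTenantIDs(orgIDRegex string, knownUsers []string) ([]string, error) {
	re, err := regexp.Compile("^(?:" + orgIDRegex + ")$")
	if err != nil {
		return nil, err
	}
	var matched []string
	for _, u := range knownUsers {
		if re.MatchString(u) {
			matched = append(matched, u)
		}
	}
	return matched, nil
}

func main() {
	// In the real resolver the known users are refreshed periodically (every
	// -tenant-federation.user-sync-interval); here they are hard-coded.
	knownUsers := []string{"user-1", "user-2", "team-a"}

	ids, err := matchTenantIDs("user-.+", knownUsers)
	if err != nil {
		panic(err)
	}
	fmt.Println(ids) // [user-1 user-2]
}
```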
@@ -486,6 +495,11 @@ func (t *Cortex) initQueryFrontendTripperware() (serv services.Service, err error) {
shardedPrometheusCodec := queryrange.NewPrometheusCodec(true, t.Cfg.Querier.ResponseCompression, t.Cfg.API.QuerierDefaultCodec)
instantQueryCodec := instantquery.NewInstantQueryCodec(t.Cfg.Querier.ResponseCompression, t.Cfg.API.QuerierDefaultCodec)

if t.Cfg.TenantFederation.Enabled && t.Cfg.TenantFederation.RegexMatcherEnabled {
// If the regex matcher is enabled, use the regex validator so the regex is passed through to the querier.
tenant.WithDefaultResolver(tenantfederation.NewRegexValidator())
}

queryRangeMiddlewares, cache, err := queryrange.Middlewares(
t.Cfg.QueryRange,
util_log.Logger,
@@ -760,6 +774,11 @@ func (t *Cortex) initTenantDeletionAPI() (services.Service, error) {
}

func (t *Cortex) initQueryScheduler() (services.Service, error) {
if t.Cfg.TenantFederation.Enabled && t.Cfg.TenantFederation.RegexMatcherEnabled {
// If the regex matcher is enabled, use the regex validator so the regex is passed through to the querier.
tenant.WithDefaultResolver(tenantfederation.NewRegexValidator())
}

s, err := scheduler.NewScheduler(t.Cfg.QueryScheduler, t.Overrides, util_log.Logger, prometheus.DefaultRegisterer)
if err != nil {
return nil, errors.Wrap(err, "query-scheduler init")
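In the query-frontend and query-scheduler, by contrast, the resolver only needs to check that the incoming `X-Scope-OrgID` value is a compilable regex before forwarding it; the expansion to concrete tenant IDs happens later in the querier. A minimal sketch of that validation step, assuming Prometheus-style anchored RE2 syntax (the function name is illustrative, not the PR's `NewRegexValidator` API):

```go
package main

import (
	"fmt"
	"regexp"
)

// validateOrgIDRegex checks that the X-Scope-OrgID value is a valid regular
// expression, so it can be forwarded to the querier unchanged.
func validateOrgIDRegex(orgID string) error {
	if _, err := regexp.Compile("^(?:" + orgID + ")$"); err != nil {
		return fmt.Errorf("invalid X-Scope-OrgID regex %q: %w", orgID, err)
	}
	return nil
}

func main() {
	fmt.Println(validateOrgIDRegex("user-.+")) // <nil>
	fmt.Println(validateOrgIDRegex("user-([")) // prints a non-nil error
}
```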
110 changes: 110 additions & 0 deletions pkg/querier/tenantfederation/exemplar_merge_queryable_test.go
@@ -5,7 +5,9 @@ import (
"errors"
"strings"
"testing"
"time"

"github.com/go-kit/log"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
"github.com/prometheus/prometheus/model/exemplar"
@@ -14,7 +16,9 @@
"github.com/stretchr/testify/require"
"github.com/weaveworks/common/user"

"github.com/cortexproject/cortex/pkg/ingester"
"github.com/cortexproject/cortex/pkg/tenant"
"github.com/cortexproject/cortex/pkg/util/test"
)

var (
@@ -311,6 +315,112 @@ func Test_MergeExemplarQuerier_Select(t *testing.T) {
}
}

func Test_MergeExemplarQuerier_Select_WhenUseRegexResolver(t *testing.T) {
// set a regex tenant resolver
reg := prometheus.NewRegistry()
userStat := ingester.UserStats{}
regexResolver := NewRegexResolver(reg, time.Second, log.NewNopLogger(), func(ctx context.Context) ([]ingester.UserIDStats, error) {
return []ingester.UserIDStats{
{UserID: "user-1", UserStats: userStat},
{UserID: "user-2", UserStats: userStat},
}, nil
})
tenant.WithDefaultResolver(regexResolver)

// Wait for the resolver to update knownUsers.
test.Poll(t, time.Second*10, true, func() interface{} {
return testutil.ToFloat64(regexResolver.lastUpdateUserRun) > 0
})

tests := []struct {
name string
upstream mockExemplarQueryable
matcher [][]*labels.Matcher
orgId string
expectedResult []exemplar.QueryResult
expectedErr error
expectedMetrics string
}{
{
name: "result labels should contains __tenant_id__ even if one tenant is queried",
upstream: mockExemplarQueryable{exemplarQueriers: map[string]storage.ExemplarQuerier{
"user-1": &mockExemplarQuerier{res: getFixtureExemplarResult1()},
"user-2": &mockExemplarQuerier{res: getFixtureExemplarResult2()},
}},
matcher: [][]*labels.Matcher{{
labels.MustNewMatcher(labels.MatchEqual, "__name__", "exemplar_series"),
}},
orgId: ".+-1",
expectedResult: []exemplar.QueryResult{
{
SeriesLabels: labels.FromStrings("__name__", "exemplar_series", "__tenant_id__", "user-1"),
Exemplars: []exemplar.Exemplar{
{
Labels: labels.FromStrings("traceID", "123"),
Value: 123,
Ts: 1734942337900,
},
},
},
},
expectedMetrics: expectedSingleTenantsExemplarMetrics,
},
{
name: "two tenants results should be aggregated",
upstream: mockExemplarQueryable{exemplarQueriers: map[string]storage.ExemplarQuerier{
"user-1": &mockExemplarQuerier{res: getFixtureExemplarResult1()},
"user-2": &mockExemplarQuerier{res: getFixtureExemplarResult2()},
}},
matcher: [][]*labels.Matcher{{
labels.MustNewMatcher(labels.MatchEqual, "__name__", "exemplar_series"),
}},
orgId: "user-.+",
expectedResult: []exemplar.QueryResult{
{
SeriesLabels: labels.FromStrings("__name__", "exemplar_series", "__tenant_id__", "user-1"),
Exemplars: []exemplar.Exemplar{
{
Labels: labels.FromStrings("traceID", "123"),
Value: 123,
Ts: 1734942337900,
},
},
},
{
SeriesLabels: labels.FromStrings("__name__", "exemplar_series", "__tenant_id__", "user-2"),
Exemplars: []exemplar.Exemplar{
{
Labels: labels.FromStrings("traceID", "456"),
Value: 456,
Ts: 1734942338000,
},
},
},
},
expectedMetrics: expectedTwoTenantsExemplarMetrics,
},
}

for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
reg := prometheus.NewPedanticRegistry()
exemplarQueryable := NewExemplarQueryable(&test.upstream, defaultMaxConcurrency, false, reg)
ctx := user.InjectOrgID(context.Background(), test.orgId)
q, err := exemplarQueryable.ExemplarQuerier(ctx)
require.NoError(t, err)

result, err := q.Select(mint, maxt, test.matcher...)
if test.expectedErr != nil {
require.Error(t, err)
} else {
require.NoError(t, err)
require.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(test.expectedMetrics), "cortex_querier_federated_tenants_per_exemplar_query"))
require.Equal(t, test.expectedResult, result)
}
})
}
}

func Test_filterAllTenantsAndMatchers(t *testing.T) {
idLabelName := defaultTenantLabel
