Skip to content

Commit e2f037e

Browse files
rules: Add new RuleEvaluationTimeSum field to groups (prometheus#15672)
* feat(ruler): Add new `RuleEvaluationTimeSum` field to groups Coupled with a metric: `rule_group_last_rule_duration_sum_seconds` This will give us more observability into how fast a group runs with or without concurrency Signed-off-by: Julien Duchesne <[email protected]> * Update rules/group.go Co-authored-by: gotjosh <[email protected]> Signed-off-by: Julien Duchesne <[email protected]> Signed-off-by: Julien Duchesne <[email protected]> * Apply suggestions from code review Co-authored-by: gotjosh <[email protected]> Signed-off-by: Julien Duchesne <[email protected]> Signed-off-by: Julien Duchesne <[email protected]> * Remove `in seconds`. A duration is a duration Signed-off-by: Julien Duchesne <[email protected]> --------- Signed-off-by: Julien Duchesne <[email protected]> Signed-off-by: Julien Duchesne <[email protected]> Co-authored-by: gotjosh <[email protected]>
1 parent 7802ca2 commit e2f037e

File tree

3 files changed

+56
-17
lines changed

3 files changed

+56
-17
lines changed

rules/group.go

Lines changed: 47 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -44,19 +44,20 @@ import (
4444

4545
// Group is a set of rules that have a logical relation.
4646
type Group struct {
47-
name string
48-
file string
49-
interval time.Duration
50-
queryOffset *time.Duration
51-
limit int
52-
rules []Rule
53-
seriesInPreviousEval []map[string]labels.Labels // One per Rule.
54-
staleSeries []labels.Labels
55-
opts *ManagerOptions
56-
mtx sync.Mutex
57-
evaluationTime time.Duration
58-
lastEvaluation time.Time // Wall-clock time of most recent evaluation.
59-
lastEvalTimestamp time.Time // Time slot used for most recent evaluation.
47+
name string
48+
file string
49+
interval time.Duration
50+
queryOffset *time.Duration
51+
limit int
52+
rules []Rule
53+
seriesInPreviousEval []map[string]labels.Labels // One per Rule.
54+
staleSeries []labels.Labels
55+
opts *ManagerOptions
56+
mtx sync.Mutex
57+
evaluationTime time.Duration // Time it took to evaluate the group.
58+
evaluationRuleTimeSum time.Duration // Sum of time it took to evaluate each rule in the group.
59+
lastEvaluation time.Time // Wall-clock time of most recent evaluation.
60+
lastEvalTimestamp time.Time // Time slot used for most recent evaluation.
6061

6162
shouldRestore bool
6263

@@ -115,6 +116,7 @@ func NewGroup(o GroupOptions) *Group {
115116
metrics.EvalFailures.WithLabelValues(key)
116117
metrics.GroupLastEvalTime.WithLabelValues(key)
117118
metrics.GroupLastDuration.WithLabelValues(key)
119+
metrics.GroupLastRuleDurationSum.WithLabelValues(key)
118120
metrics.GroupRules.WithLabelValues(key).Set(float64(len(o.Rules)))
119121
metrics.GroupSamples.WithLabelValues(key)
120122
metrics.GroupInterval.WithLabelValues(key).Set(o.Interval.Seconds())
@@ -370,6 +372,28 @@ func (g *Group) setEvaluationTime(dur time.Duration) {
370372
g.evaluationTime = dur
371373
}
372374

375+
// GetRuleEvaluationTimeSum returns evaluationRuleTimeSum, the sum of the time it took to evaluate each rule in the group irrespective of concurrency; the value is read while holding the group mutex.
376+
func (g *Group) GetRuleEvaluationTimeSum() time.Duration {
377+
g.mtx.Lock()
378+
defer g.mtx.Unlock()
379+
return g.evaluationRuleTimeSum
380+
}
381+
382+
// updateRuleEvaluationTimeSum recomputes evaluationRuleTimeSum, the sum of the time it took to evaluate each rule in the group irrespective of concurrency, and sets the GroupLastRuleDurationSum gauge for this group to that sum in seconds.
383+
// It collects the times from the rules themselves via GetEvaluationDuration; the group mutex is taken only to store the computed sum.
384+
func (g *Group) updateRuleEvaluationTimeSum() {
385+
var sum time.Duration
386+
for _, rule := range g.rules {
387+
sum += rule.GetEvaluationDuration()
388+
}
389+
390+
g.metrics.GroupLastRuleDurationSum.WithLabelValues(GroupKey(g.file, g.name)).Set(sum.Seconds())
391+
392+
g.mtx.Lock()
393+
defer g.mtx.Unlock()
394+
g.evaluationRuleTimeSum = sum
395+
}
396+
373397
// GetLastEvaluation returns the time the last evaluation of the rule group took place.
374398
func (g *Group) GetLastEvaluation() time.Time {
375399
g.mtx.Lock()
@@ -874,6 +898,7 @@ type Metrics struct {
874898
GroupInterval *prometheus.GaugeVec
875899
GroupLastEvalTime *prometheus.GaugeVec
876900
GroupLastDuration *prometheus.GaugeVec
901+
GroupLastRuleDurationSum *prometheus.GaugeVec
877902
GroupLastRestoreDuration *prometheus.GaugeVec
878903
GroupRules *prometheus.GaugeVec
879904
GroupSamples *prometheus.GaugeVec
@@ -952,6 +977,14 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
952977
},
953978
[]string{"rule_group"},
954979
),
980+
GroupLastRuleDurationSum: prometheus.NewGaugeVec(
981+
prometheus.GaugeOpts{
982+
Namespace: namespace,
983+
Name: "rule_group_last_rule_duration_sum_seconds",
984+
Help: "The sum of time in seconds it took to evaluate each rule in the group regardless of concurrency. This should be higher than the group duration if rules are evaluated concurrently.",
985+
},
986+
[]string{"rule_group"},
987+
),
955988
GroupLastRestoreDuration: prometheus.NewGaugeVec(
956989
prometheus.GaugeOpts{
957990
Namespace: namespace,
@@ -989,6 +1022,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
9891022
m.GroupInterval,
9901023
m.GroupLastEvalTime,
9911024
m.GroupLastDuration,
1025+
m.GroupLastRuleDurationSum,
9921026
m.GroupLastRestoreDuration,
9931027
m.GroupRules,
9941028
m.GroupSamples,

rules/manager.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ func DefaultEvalIterationFunc(ctx context.Context, g *Group, evalTimestamp time.
8282
timeSinceStart := time.Since(start)
8383

8484
g.metrics.IterationDuration.Observe(timeSinceStart.Seconds())
85+
g.updateRuleEvaluationTimeSum()
8586
g.setEvaluationTime(timeSinceStart)
8687
g.setLastEvaluation(start)
8788
g.setLastEvalTimestamp(evalTimestamp)

rules/manager_test.go

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1985,14 +1985,16 @@ func TestAsyncRuleEvaluation(t *testing.T) {
19851985
require.Len(t, group.rules, ruleCount)
19861986

19871987
start := time.Now()
1988-
group.Eval(ctx, start)
1988+
DefaultEvalIterationFunc(ctx, group, start)
19891989

19901990
// Never expect more than 1 inflight query at a time.
19911991
require.EqualValues(t, 1, maxInflight.Load())
19921992
// Each rule should take at least 1 second to execute sequentially.
19931993
require.GreaterOrEqual(t, time.Since(start).Seconds(), (time.Duration(ruleCount) * artificialDelay).Seconds())
19941994
// Each rule produces one vector.
19951995
require.EqualValues(t, ruleCount, testutil.ToFloat64(group.metrics.GroupSamples))
1996+
// Group duration is at least the sum of rule durations (group overhead).
1997+
require.GreaterOrEqual(t, group.GetEvaluationTime(), group.GetRuleEvaluationTimeSum())
19961998
}
19971999
})
19982000

@@ -2023,7 +2025,7 @@ func TestAsyncRuleEvaluation(t *testing.T) {
20232025
require.Len(t, group.rules, ruleCount)
20242026

20252027
start := time.Now()
2026-
group.Eval(ctx, start)
2028+
DefaultEvalIterationFunc(ctx, group, start)
20272029

20282030
// Max inflight can be 1 synchronous eval and up to MaxConcurrentEvals concurrent evals.
20292031
require.EqualValues(t, opts.MaxConcurrentEvals+1, maxInflight.Load())
@@ -2061,7 +2063,7 @@ func TestAsyncRuleEvaluation(t *testing.T) {
20612063
require.Len(t, group.rules, ruleCount)
20622064

20632065
start := time.Now()
2064-
group.Eval(ctx, start)
2066+
DefaultEvalIterationFunc(ctx, group, start)
20652067

20662068
// Max inflight can be 1 synchronous eval and up to MaxConcurrentEvals concurrent evals.
20672069
require.EqualValues(t, opts.MaxConcurrentEvals+1, maxInflight.Load())
@@ -2100,14 +2102,16 @@ func TestAsyncRuleEvaluation(t *testing.T) {
21002102

21012103
start := time.Now()
21022104

2103-
group.Eval(ctx, start)
2105+
DefaultEvalIterationFunc(ctx, group, start)
21042106

21052107
// Max inflight can be up to MaxConcurrentEvals concurrent evals, since there is sufficient concurrency to run all rules at once.
21062108
require.LessOrEqual(t, int64(maxInflight.Load()), opts.MaxConcurrentEvals)
21072109
// Some rules should execute concurrently so should complete quicker.
21082110
require.Less(t, time.Since(start).Seconds(), (time.Duration(ruleCount) * artificialDelay).Seconds())
21092111
// Each rule produces one vector.
21102112
require.EqualValues(t, ruleCount, testutil.ToFloat64(group.metrics.GroupSamples))
2113+
// Group duration is less than the sum of rule durations (rules are evaluated concurrently).
2114+
require.Less(t, group.GetEvaluationTime(), group.GetRuleEvaluationTimeSum())
21112115
}
21122116
})
21132117

0 commit comments

Comments
 (0)