Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

option to resolve alerts that fired inhibited #3034

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cmd/alertmanager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,7 @@ func run() int {
intervener,
notificationLog,
pipelinePeer,
conf.Global.ResolveInhibited,
)

configuredReceivers.Set(float64(len(activeReceivers)))
Expand Down
3 changes: 2 additions & 1 deletion config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -738,7 +738,8 @@ func (hp HostPort) String() string {
type GlobalConfig struct {
// ResolveTimeout is the time after which an alert is declared resolved
// if it has not been updated.
ResolveTimeout model.Duration `yaml:"resolve_timeout" json:"resolve_timeout"`
ResolveTimeout model.Duration `yaml:"resolve_timeout" json:"resolve_timeout"`
ResolveInhibited bool `yaml:"resolve_inhibited,omitempty" json:"resolve_inhibited,omitempty"`

HTTPConfig *commoncfg.HTTPClientConfig `yaml:"http_config,omitempty" json:"http_config,omitempty"`

Expand Down
24 changes: 16 additions & 8 deletions notify/notify.go
Original file line number Diff line number Diff line change
Expand Up @@ -377,14 +377,15 @@ func (pb *PipelineBuilder) New(
intervener *timeinterval.Intervener,
notificationLog NotificationLog,
peer Peer,
resolveInhibited bool,
) RoutingStage {
rs := make(RoutingStage, len(receivers))

ms := NewGossipSettleStage(peer)
is := NewMuteStage(inhibitor)
tas := NewTimeActiveStage(intervener)
tms := NewTimeMuteStage(intervener)
ss := NewMuteStage(silencer)
is := NewMuteStage(inhibitor, resolveInhibited)
ss := NewMuteStage(silencer, resolveInhibited)

for name := range receivers {
st := createReceiverStage(name, receivers[name], wait, notificationLog, pb.metrics)
Expand Down Expand Up @@ -509,12 +510,13 @@ func (n *GossipSettleStage) Exec(ctx context.Context, _ log.Logger, alerts ...*t

// MuteStage filters alerts through a Muter.
type MuteStage struct {
muter types.Muter
muter types.Muter
resolveInhibited bool
}

// NewMuteStage return a new MuteStage.
func NewMuteStage(m types.Muter) *MuteStage {
return &MuteStage{muter: m}
func NewMuteStage(m types.Muter, resolveInhibited bool) *MuteStage {
return &MuteStage{muter: m, resolveInhibited: resolveInhibited}
}

// Exec implements the Stage interface.
Expand All @@ -525,11 +527,17 @@ func (n *MuteStage) Exec(ctx context.Context, logger log.Logger, alerts ...*type
)
for _, a := range alerts {
// TODO(fabxc): increment total alerts counter.
isMuted := n.muter.Mutes(a.Labels)
if !isMuted && !a.Resolved() && n.resolveInhibited {
a.FiredUnmuted = true
}
// Do not send the alert if muted.
if n.muter.Mutes(a.Labels) {
muted = append(muted, a)
} else {
// Do send resolved notifications if configured
// to do so and the alert fired unmuted.
if !isMuted || (a.Resolved() && a.FiredUnmuted && n.resolveInhibited) {
filtered = append(filtered, a)
} else {
muted = append(muted, a)
Comment on lines +530 to +540
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i actually find this hunk a little confusing. why are we flipping a.FiredUnmuted here? the conditional looks *really* similar to the second one there, and, what's worse, the a.FireUnmuted` is reused there which, effectively, makes the second conditional look like:

!isMuted || (a.Resolved() && (!isMuted && !a.Resolved() && n.resolveInhibited) && n.resolveInhibited) 

which i think is equivalent to:

!isMuted || (!a.FiredUnmuted && a.Resolved() && && n.resolveInhibited) 

before a.FiredUnmuted is reset. i wonder if there's a way to make this code more readable, perhaps with a comment explaing why the FiredUnmuted flag is changed there?

}
// TODO(fabxc): increment muted alerts counter if muted.
}
Expand Down
50 changes: 48 additions & 2 deletions notify/notify_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -666,7 +666,7 @@ func TestMuteStage(t *testing.T) {
return ok
})

stage := NewMuteStage(muter)
stage := NewMuteStage(muter, false)

in := []model.LabelSet{
{},
Expand Down Expand Up @@ -707,6 +707,52 @@ func TestMuteStage(t *testing.T) {
}
}

func TestResolveMuted(t *testing.T) {
// test alert
alert := &types.Alert{
Alert: model.Alert{Labels: model.LabelSet{"mute": "me"}},
}

// fire alert unmuted
nonMuter := types.MuteFunc(func(lset model.LabelSet) bool {
return false
})

stage := NewMuteStage(nonMuter, true)

_, alerts, err := stage.Exec(context.Background(), log.NewNopLogger(), alert)
if err != nil {
t.Fatalf("Exec failed: %s", err)
}

if len(alerts) != 1 {
t.Fatalf("Did not receive alert back")
}

if !alerts[0].FiredUnmuted {
t.Fatalf("Alert did not have fired unmuted flag set")
}

alert.EndsAt = time.Now().Add(-time.Second)

// Mute all label sets that have a "mute" key.
muter := types.MuteFunc(func(lset model.LabelSet) bool {
_, ok := lset["mute"]
return ok
})

stage = NewMuteStage(muter, true)

_, alerts, err = stage.Exec(context.Background(), log.NewNopLogger(), alert)
if err != nil {
t.Fatalf("Exec failed: %s", err)
}

if len(alerts) != 1 {
t.Fatalf("Did not receive muted resolved alert back")
}
}

func TestMuteStageWithSilences(t *testing.T) {
silences, err := silence.New(silence.Options{Retention: time.Hour})
if err != nil {
Expand All @@ -722,7 +768,7 @@ func TestMuteStageWithSilences(t *testing.T) {

marker := types.NewMarker(prometheus.NewRegistry())
silencer := silence.NewSilencer(silences, marker, log.NewNopLogger())
stage := NewMuteStage(silencer)
stage := NewMuteStage(silencer, false)

in := []model.LabelSet{
{},
Expand Down
5 changes: 3 additions & 2 deletions types/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -300,8 +300,9 @@ type Alert struct {
model.Alert

// The authoritative timestamp.
UpdatedAt time.Time
Timeout bool
UpdatedAt time.Time
Timeout bool
FiredUnmuted bool
}

func validateLs(ls model.LabelSet) error {
Expand Down
Loading