From 3c073a684d2c3b80309f7320f7270a505da23b5d Mon Sep 17 00:00:00 2001 From: Abduh Date: Wed, 22 Feb 2023 18:32:49 +0700 Subject: [PATCH] feat: better grouping alert & pagerduty receiver fix (#172) * fix(alert): better grouping when sending notification * fix(pagerduty): proper incident key to resolve and silent when acked --- core/notification/builder.go | 103 ++++++----- core/notification/builder_test.go | 84 ++++++--- core/notification/notification.go | 1 + core/notification/utils.go | 45 +++++ core/notification/utils_test.go | 165 ++++++++++++++++++ go.mod | 1 + go.sum | 2 + internal/api/v1beta1/alert.go | 11 +- internal/store/model/notification.go | 8 + ...d_notifications_column_unique_key.down.sql | 3 + ...add_notifications_column_unique_key.up.sql | 5 + internal/store/postgres/notification.go | 5 +- .../postgres/testdata/mock-notification.json | 2 + plugins/receivers/pagerduty/client.go | 6 +- plugins/receivers/pagerduty/client_test.go | 2 +- .../default_alert_template_body_v1.goyaml | 2 +- 16 files changed, 359 insertions(+), 86 deletions(-) create mode 100644 core/notification/utils.go create mode 100644 core/notification/utils_test.go create mode 100644 internal/store/postgres/migrations/000010_add_notifications_column_unique_key.down.sql create mode 100644 internal/store/postgres/migrations/000010_add_notifications_column_unique_key.up.sql diff --git a/core/notification/builder.go b/core/notification/builder.go index 4cb90154..94a37c42 100644 --- a/core/notification/builder.go +++ b/core/notification/builder.go @@ -28,62 +28,70 @@ import ( // - alertname // - (others labels defined in rules) func BuildFromAlerts( - as []alert.Alert, + alerts []alert.Alert, firingLen int, createdTime time.Time, -) Notification { - if len(as) == 0 { - return Notification{} +) ([]Notification, error) { + if len(alerts) == 0 { + return nil, errors.New("empty alerts") } - sampleAlert := as[0] + alertsMap, err := groupByLabels(alerts) + if err != nil { + return nil, err + } + + var notifications []Notification - data := map[string]interface{}{} + for hashKey, groupedAlerts := range alertsMap { + sampleAlert := groupedAlerts[0] - mergedAnnotations := map[string][]string{} - for _, a := range as { - for k, v := range a.Annotations { - mergedAnnotations[k] = append(mergedAnnotations[k], v) + data := map[string]interface{}{} + + mergedAnnotations := map[string][]string{} + for _, a := range groupedAlerts { + for k, v := range a.Annotations { + mergedAnnotations[k] = append(mergedAnnotations[k], v) + } } - } - // make unique - for k, v := range mergedAnnotations { - mergedAnnotations[k] = removeDuplicateStringValues(v) - } - // render annotations - for k, vSlice := range mergedAnnotations { - for _, v := range vSlice { - if _, ok := data[k]; ok { - data[k] = fmt.Sprintf("%s\n%s", data[k], v) - } else { - data[k] = v + // make unique + for k, v := range mergedAnnotations { + mergedAnnotations[k] = removeDuplicateStringValues(v) + } + // render annotations + for k, vSlice := range mergedAnnotations { + for _, v := range vSlice { + if _, ok := data[k]; ok { + data[k] = fmt.Sprintf("%s\n%s", data[k], v) + } else { + data[k] = v + } } } - } - data["status"] = sampleAlert.Status - data["generator_url"] = sampleAlert.GeneratorURL - data["num_alerts_firing"] = firingLen + data["status"] = sampleAlert.Status + data["generator_url"] = sampleAlert.GeneratorURL + data["num_alerts_firing"] = firingLen - labels := map[string]string{} - alertIDs := []int64{} + alertIDs := []int64{} - for _, a := range as { - alertIDs = append(alertIDs, int64(a.ID)) - for k, v := range a.Labels { - labels[k] = v + for _, a := range groupedAlerts { + alertIDs = append(alertIDs, int64(a.ID)) } - } - return Notification{ - NamespaceID: sampleAlert.NamespaceID, - Type: TypeSubscriber, - Data: data, - Labels: labels, - Template: template.ReservedName_SystemDefault, - CreatedAt: createdTime, - AlertIDs: alertIDs, + notifications = append(notifications, Notification{ + NamespaceID: sampleAlert.NamespaceID, + Type: TypeSubscriber, + Data: data, + Labels: sampleAlert.Labels, + Template: template.ReservedName_SystemDefault, + UniqueKey: hashGroupKey(sampleAlert.GroupKey, hashKey), + CreatedAt: createdTime, + AlertIDs: alertIDs, + }) } + + return notifications, nil } // BuildTypeReceiver builds a notification struct with receiver type flow @@ -115,16 +123,3 @@ func BuildTypeReceiver(receiverID uint64, payloadMap map[string]interface{}) (No return n, nil } - -func removeDuplicateStringValues(strSlice []string) []string { - keys := make(map[string]bool) - list := []string{} - - for _, v := range strSlice { - if _, value := keys[v]; !value { - keys[v] = true - list = append(list, v) - } - } - return list -} diff --git a/core/notification/builder_test.go b/core/notification/builder_test.go index b72e5840..2cdb521f 100644 --- a/core/notification/builder_test.go +++ b/core/notification/builder_test.go @@ -5,6 +5,7 @@ import ( "time" "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" "github.com/odpf/siren/core/alert" "github.com/odpf/siren/core/notification" "github.com/odpf/siren/core/template" @@ -86,17 +87,18 @@ func TestBuildFromAlerts(t *testing.T) { name string alerts []alert.Alert firingLen int - want notification.Notification + want []notification.Notification + errString string }{ { - name: "should return empty notification if alerts slice is empty", - want: notification.Notification{}, + name: "should return empty notification if alerts slice is empty", + errString: "empty alerts", }, { name: `should properly return notification - same annotations are joined by newline - - labels are merged + - different labels are splitted into two notifications `, alerts: []alert.Alert{ { @@ -121,38 +123,72 @@ func TestBuildFromAlerts(t *testing.T) { MetricValue: "16", Severity: "WARNING", Rule: "test-alert-template", - Labels: map[string]string{"lk1": "lv11", "lk2": "lv2"}, + Labels: map[string]string{"lk1": "lv1", "lk2": "lv2"}, + Annotations: map[string]string{"ak1": "akv1"}, + Status: "FIRING", + }, + { + ID: 16, + ProviderID: 1, + NamespaceID: 1, + ResourceName: "test-alert-host-2", + MetricName: "test-alert", + MetricValue: "16", + Severity: "WARNING", + Rule: "test-alert-template", + Labels: map[string]string{"lk1": "lv1", "lk2": "lv2"}, Annotations: map[string]string{"ak1": "akv11", "ak2": "akv2"}, Status: "FIRING", }, }, firingLen: 2, - want: notification.Notification{ - NamespaceID: 1, - Type: notification.TypeSubscriber, - - Data: map[string]interface{}{ - "generator_url": "", - "num_alerts_firing": 2, - "status": "FIRING", - "ak1": "akv1\nakv11", - "ak2": "akv2", + want: []notification.Notification{ + { + NamespaceID: 1, + Type: notification.TypeSubscriber, + Data: map[string]interface{}{ + "generator_url": "", + "num_alerts_firing": 2, + "status": "FIRING", + "ak1": "akv1", + }, + Labels: map[string]string{ + "lk1": "lv1", + }, + UniqueKey: "ignored", + Template: template.ReservedName_SystemDefault, + AlertIDs: []int64{14}, }, - Labels: map[string]string{ - "lk1": "lv11", - "lk2": "lv2", + { + NamespaceID: 1, + Type: notification.TypeSubscriber, + + Data: map[string]interface{}{ + "generator_url": "", + "num_alerts_firing": 2, + "status": "FIRING", + "ak1": "akv1\nakv11", + "ak2": "akv2", + }, + Labels: map[string]string{ + "lk1": "lv1", + "lk2": "lv2", + }, + UniqueKey: "ignored", + Template: template.ReservedName_SystemDefault, + AlertIDs: []int64{15, 16}, }, - Template: template.ReservedName_SystemDefault, - AlertIDs: []int64{14, 15}, }, }, - {}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got := notification.BuildFromAlerts(tt.alerts, tt.firingLen, time.Time{}) - - if diff := cmp.Diff(got, tt.want); diff != "" { + got, err := notification.BuildFromAlerts(tt.alerts, tt.firingLen, time.Time{}) + if (err != nil) && (err.Error() != tt.errString) { + t.Errorf("BuildTypeReceiver() error = %v, wantErr %s", err, tt.errString) + return + } + if diff := cmp.Diff(got, tt.want, cmpopts.IgnoreFields(notification.Notification{}, "ID", "UniqueKey")); diff != "" { t.Errorf("BuildFromAlerts() got diff = %v", diff) } }) diff --git a/core/notification/notification.go b/core/notification/notification.go index 8c15532f..08e77579 100644 --- a/core/notification/notification.go +++ b/core/notification/notification.go @@ -32,6 +32,7 @@ type Notification struct { Labels map[string]string `json:"labels"` ValidDuration time.Duration `json:"valid_duration"` Template string `json:"template"` + UniqueKey string `json:"unique_key"` CreatedAt time.Time `json:"created_at"` // won't be stored in notification table, only to propaget this to notification_subscriber diff --git a/core/notification/utils.go b/core/notification/utils.go new file mode 100644 index 00000000..e7328db1 --- /dev/null +++ b/core/notification/utils.go @@ -0,0 +1,45 @@ +package notification + +import ( + "crypto/sha256" + "fmt" + + "github.com/mitchellh/hashstructure/v2" + "github.com/odpf/siren/core/alert" +) + +func removeDuplicateStringValues(strSlice []string) []string { + keys := make(map[string]bool) + list := []string{} + + for _, v := range strSlice { + if _, value := keys[v]; !value { + keys[v] = true + list = append(list, v) + } + } + return list +} + +func groupByLabels(alerts []alert.Alert) (map[uint64][]alert.Alert, error) { + var alertsMap = map[uint64][]alert.Alert{} + + for _, a := range alerts { + hash, err := hashstructure.Hash(a.Labels, hashstructure.FormatV2, nil) + if err != nil { + return nil, fmt.Errorf("cannot get hash from alert %v", a) + } + alertsMap[hash] = append(alertsMap[hash], a) + } + + return alertsMap, nil +} + +// hashGroupKey hash groupKey from alert and hashKey from labels +func hashGroupKey(groupKey string, hashKey uint64) string { + h := sha256.New() + // hash.Hash.Write never returns an error. + //nolint: errcheck + h.Write([]byte(fmt.Sprintf("%s%d", groupKey, hashKey))) + return fmt.Sprintf("%x", h.Sum(nil)) +} diff --git a/core/notification/utils_test.go b/core/notification/utils_test.go new file mode 100644 index 00000000..3df46d4e --- /dev/null +++ b/core/notification/utils_test.go @@ -0,0 +1,165 @@ +package notification + +import ( + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/mitchellh/hashstructure/v2" + "github.com/odpf/siren/core/alert" + "github.com/stretchr/testify/require" +) + +func Test_removeDuplicateStringValues(t *testing.T) { + tests := []struct { + name string + strSlice []string + want []string + }{ + { + name: "should remove duplicated string value in slice", + strSlice: []string{"a", "b", "c", "a"}, + want: []string{"a", "b", "c"}, + }, + { + name: "should return as-is if no duplicated string", + strSlice: []string{"a", "b", "c"}, + want: []string{"a", "b", "c"}, + }, + { + name: "should return empty slice if empty", + strSlice: []string{}, + want: []string{}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := removeDuplicateStringValues(tt.strSlice) + if diff := cmp.Diff(got, tt.want); diff != "" { + t.Errorf("got diff = %v", diff) + } + }) + } +} + +func Test_groupByLabels(t *testing.T) { + hashKey1, err := hashstructure.Hash(map[string]string{ + "key1": "val1", + "key2": "val2", + }, hashstructure.FormatV2, nil) + require.NoError(t, err) + + hashKey2, err := hashstructure.Hash(map[string]string{ + "key2": "val2", + "key3": "val3", + }, hashstructure.FormatV2, nil) + require.NoError(t, err) + + hashKey3, err := hashstructure.Hash(map[string]string{ + "key1": "val1", + "key3": "val3", + }, hashstructure.FormatV2, nil) + require.NoError(t, err) + + tests := []struct { + name string + alerts []alert.Alert + want map[uint64][]alert.Alert + wantErr bool + }{ + { + name: "shoudl group alerts if labels are same", + alerts: []alert.Alert{ + { + ID: 12, + Labels: map[string]string{ + "key1": "val1", + "key2": "val2", + }, + }, + { + ID: 34, + Labels: map[string]string{ + "key1": "val1", + "key2": "val2", + }, + }, + { + ID: 56, + Labels: map[string]string{ + "key2": "val2", + "key3": "val3", + }, + }, + { + ID: 78, + Labels: map[string]string{ + "key3": "val3", + "key2": "val2", + }, + }, + { + ID: 910, + Labels: map[string]string{ + "key1": "val1", + "key3": "val3", + }, + }, + }, + want: map[uint64][]alert.Alert{ + hashKey1: { + { + ID: 12, + Labels: map[string]string{ + "key1": "val1", + "key2": "val2", + }, + }, + { + ID: 34, + Labels: map[string]string{ + "key1": "val1", + "key2": "val2", + }, + }, + }, + hashKey2: { + { + ID: 56, + Labels: map[string]string{ + "key2": "val2", + "key3": "val3", + }, + }, + { + ID: 78, + Labels: map[string]string{ + "key3": "val3", + "key2": "val2", + }, + }, + }, + hashKey3: { + { + ID: 910, + Labels: map[string]string{ + "key1": "val1", + "key3": "val3", + }, + }, + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := groupByLabels(tt.alerts) + if (err != nil) != tt.wantErr { + t.Errorf("groupByLabels() error = %v, wantErr %v", err, tt.wantErr) + return + } + if diff := cmp.Diff(got, tt.want); diff != "" { + t.Errorf("got diff = %v", diff) + } + }) + } +} diff --git a/go.mod b/go.mod index 27f16a5d..9a82a1fc 100644 --- a/go.mod +++ b/go.mod @@ -127,6 +127,7 @@ require ( github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect github.com/microcosm-cc/bluemonday v1.0.17 // indirect github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect + github.com/mitchellh/hashstructure/v2 v2.0.2 // indirect github.com/moby/term v0.0.0-20210619224110-3f7ff695adc6 // indirect github.com/muesli/reflow v0.3.0 // indirect github.com/muesli/termenv v0.9.0 // indirect diff --git a/go.sum b/go.sum index 705c438a..0c85e17b 100644 --- a/go.sum +++ b/go.sum @@ -1632,6 +1632,8 @@ github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrk github.com/mitchellh/go-testing-interface v1.0.0/go.mod h1:kRemZodwjscx+RGhAo8eIhFbs2+BFgRtFPeD/KE+zxI= github.com/mitchellh/go-wordwrap v1.0.0/go.mod h1:ZXFpozHsX6DPmq2I0TCekCxypsnAUbP2oI0UX1GXzOo= github.com/mitchellh/gox v0.4.0/go.mod h1:Sd9lOJ0+aimLBi73mGofS1ycjY8lL3uZM3JPS42BGNg= +github.com/mitchellh/hashstructure/v2 v2.0.2 h1:vGKWl0YJqUNxE8d+h8f6NJLcCJrgbhC4NcD46KavDd4= +github.com/mitchellh/hashstructure/v2 v2.0.2/go.mod h1:MG3aRVU/N29oo/V/IhBX8GR/zz4kQkprJgF2EVszyDE= github.com/mitchellh/iochan v1.0.0/go.mod h1:JwYml1nuB7xOzsp52dPpHFffvOCDupsG0QubkSMEySY= github.com/mitchellh/mapstructure v0.0.0-20160808181253-ca63d7c062ee/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= github.com/mitchellh/mapstructure v0.0.0-20180220230111-00c29f56e238/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= diff --git a/internal/api/v1beta1/alert.go b/internal/api/v1beta1/alert.go index 23df4b60..d08e0dd3 100644 --- a/internal/api/v1beta1/alert.go +++ b/internal/api/v1beta1/alert.go @@ -87,10 +87,15 @@ func (s *GRPCServer) createAlerts(ctx context.Context, providerType string, prov if len(createdAlerts) > 0 { // Publish to notification service - n := notification.BuildFromAlerts(createdAlerts, firingLen, time.Now()) + ns, err := notification.BuildFromAlerts(createdAlerts, firingLen, time.Now()) + if err != nil { + s.logger.Warn("failed to build notifications from alert", "err", err, "alerts", createdAlerts) + } - if err := s.notificationService.Dispatch(ctx, n); err != nil { - s.logger.Warn("failed to send alert as notification", "err", err, "notification", n) + for _, n := range ns { + if err := s.notificationService.Dispatch(ctx, n); err != nil { + s.logger.Warn("failed to send alert as notification", "err", err, "notification", n) + } } } else { s.logger.Warn("failed to send alert as notification, empty created alerts") diff --git a/internal/store/model/notification.go b/internal/store/model/notification.go index 12a00920..23869076 100644 --- a/internal/store/model/notification.go +++ b/internal/store/model/notification.go @@ -15,6 +15,7 @@ type Notification struct { Data pgc.StringInterfaceMap `db:"data"` Labels pgc.StringStringMap `db:"labels"` ValidDuration pgc.TimeDuration `db:"valid_duration"` + UniqueKey sql.NullString `db:"unique_key"` Template sql.NullString `db:"template"` CreatedAt time.Time `db:"created_at"` } @@ -38,6 +39,12 @@ func (n *Notification) FromDomain(d notification.Notification) { n.Template = sql.NullString{String: d.Template, Valid: true} } + if d.UniqueKey == "" { + n.UniqueKey = sql.NullString{Valid: false} + } else { + n.UniqueKey = sql.NullString{String: d.UniqueKey, Valid: true} + } + n.CreatedAt = d.CreatedAt } @@ -50,6 +57,7 @@ func (n *Notification) ToDomain() notification.Notification { Labels: n.Labels, ValidDuration: time.Duration(n.ValidDuration), Template: n.Template.String, + UniqueKey: n.UniqueKey.String, CreatedAt: n.CreatedAt, } } diff --git a/internal/store/postgres/migrations/000010_add_notifications_column_unique_key.down.sql b/internal/store/postgres/migrations/000010_add_notifications_column_unique_key.down.sql new file mode 100644 index 00000000..9f869468 --- /dev/null +++ b/internal/store/postgres/migrations/000010_add_notifications_column_unique_key.down.sql @@ -0,0 +1,3 @@ +ALTER TABLE + notifications +DROP COLUMN IF EXISTS unique_key; \ No newline at end of file diff --git a/internal/store/postgres/migrations/000010_add_notifications_column_unique_key.up.sql b/internal/store/postgres/migrations/000010_add_notifications_column_unique_key.up.sql new file mode 100644 index 00000000..9b9ddb8e --- /dev/null +++ b/internal/store/postgres/migrations/000010_add_notifications_column_unique_key.up.sql @@ -0,0 +1,5 @@ +ALTER TABLE + notifications +ADD COLUMN IF NOT EXISTS unique_key text; + +CREATE INDEX IF NOT EXISTS notifications_idx_unique_key ON notifications(unique_key); \ No newline at end of file diff --git a/internal/store/postgres/notification.go b/internal/store/postgres/notification.go index 11eaea68..145a9ba1 100644 --- a/internal/store/postgres/notification.go +++ b/internal/store/postgres/notification.go @@ -9,8 +9,8 @@ import ( ) const notificationInsertQuery = ` -INSERT INTO notifications (namespace_id, type, data, labels, valid_duration, template, created_at) - VALUES ($1, $2, $3, $4, $5, $6, now()) +INSERT INTO notifications (namespace_id, type, data, labels, valid_duration, template, unique_key, created_at) + VALUES ($1, $2, $3, $4, $5, $6, $7, now()) RETURNING * ` @@ -40,6 +40,7 @@ func (r *NotificationRepository) Create(ctx context.Context, n notification.Noti nModel.Labels, nModel.ValidDuration, nModel.Template, + nModel.UniqueKey, ).StructScan(&newNModel); err != nil { return notification.Notification{}, err } diff --git a/internal/store/postgres/testdata/mock-notification.json b/internal/store/postgres/testdata/mock-notification.json index 1e18e99c..bd69531a 100644 --- a/internal/store/postgres/testdata/mock-notification.json +++ b/internal/store/postgres/testdata/mock-notification.json @@ -1,5 +1,6 @@ [ { + "id": "123", "namespace_id": 1, "type": "receiver", "data": { @@ -9,6 +10,7 @@ "template": "" }, { + "id": "456", "namespace_id": 1, "type": "subscriber", "data": { diff --git a/plugins/receivers/pagerduty/client.go b/plugins/receivers/pagerduty/client.go index bd5b318b..e160a954 100644 --- a/plugins/receivers/pagerduty/client.go +++ b/plugins/receivers/pagerduty/client.go @@ -99,7 +99,11 @@ func (c *Client) notifyV1(ctx context.Context, message MessageV1) error { } if resp.StatusCode >= 300 { - return errors.New(http.StatusText(resp.StatusCode)) + bodyBytes, err := io.ReadAll(resp.Body) + if err != nil { + return fmt.Errorf("error with status code %s without response body", http.StatusText(resp.StatusCode)) + } + return fmt.Errorf("error with status code %s and body %s", http.StatusText(resp.StatusCode), string(bodyBytes)) } else { // Status code 2xx only bodyBytes, err := io.ReadAll(resp.Body) diff --git a/plugins/receivers/pagerduty/client_test.go b/plugins/receivers/pagerduty/client_test.go index e689580f..443a878e 100644 --- a/plugins/receivers/pagerduty/client_test.go +++ b/plugins/receivers/pagerduty/client_test.go @@ -68,7 +68,7 @@ func TestClient_NotifyV1_HTTPCall(t *testing.T) { c := pagerduty.NewClient(pagerduty.AppConfig{APIHost: testServer.URL}) err := c.NotifyV1(context.Background(), pagerduty.MessageV1{}) - assert.EqualError(t, err, "Bad Request") + assert.EqualError(t, err, "error with status code Bad Request and body ") testServer.Close() }) diff --git a/plugins/receivers/pagerduty/config/default_alert_template_body_v1.goyaml b/plugins/receivers/pagerduty/config/default_alert_template_body_v1.goyaml index 87a3b278..5f872bec 100644 --- a/plugins/receivers/pagerduty/config/default_alert_template_body_v1.goyaml +++ b/plugins/receivers/pagerduty/config/default_alert_template_body_v1.goyaml @@ -8,7 +8,7 @@ [[- end]] [[- end]] event_type: "[[template "pagerduty.event_type" . ]]" -[[if .Data.id]]incident_key: "[[.Data.id]]"[[ end ]] +[[if .Data.id]]incident_key: "[[.UniqueKey]]"[[ end ]] description: ([[ .Data.status | toUpper ]][[ if eq .Data.status "firing" ]]:[[ .Data.num_alerts_firing ]][[ end ]]) [[- if eq .Data.status "resolved" ]] ~([[ .Labels.severity | toUpper ]])~ [[- else ]] *([[ .Labels.severity | toUpper ]])*