Skip to content

Commit 817a887

Browse files
author
Cairry
committed
♻️ refactor(alert): remove silence status related logic
1 parent 3b7ece5 commit 817a887

File tree

9 files changed

+70
-120
lines changed

9 files changed

+70
-120
lines changed

alert/consumer/consumer.go

Lines changed: 10 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ import (
77
"runtime/debug"
88
"sync"
99
"time"
10-
"watchAlert/alert/mute"
1110
"watchAlert/alert/process"
1211
"watchAlert/internal/ctx"
1312
"watchAlert/internal/models"
@@ -215,22 +214,15 @@ func (c *Consume) filterAlertEvents(faultCenter models.FaultCenter, alerts map[s
215214
continue
216215
}
217216

218-
// 过滤掉 预告警, 待恢复 状态的事件
219-
if event.Status == models.StatePreAlert || event.Status == models.StatePendingRecovery {
220-
continue
221-
}
222-
223-
if c.isMutedEvent(event, faultCenter) {
224-
// 当告警处于静默状态时触发了恢复告警,直接移除即可 不需要发送消息。
225-
if event.Status == models.StateRecovered {
226-
c.ctx.Redis.Alert().RemoveAlertEvent(event.TenantId, event.FaultCenterId, event.Fingerprint)
217+
// 过滤掉 非告警中 状态的事件
218+
if event.Status != models.StateAlerting {
219+
if event.IsRecovered {
220+
c.removeAlertFromCache(event)
221+
if err := process.RecordAlertHisEvent(c.ctx, *event); err != nil {
222+
logc.Error(c.ctx.Ctx, fmt.Sprintf("Failed to record alert history: %v", err))
223+
}
227224
}
228225

229-
// 如果是在静默范围内,但当前状态是非静默状态,则更新该告警为静默状态
230-
if event.Status != models.StateSilenced {
231-
event.TransitionStatus(models.StateSilenced)
232-
c.ctx.Redis.Alert().PushAlertEvent(event)
233-
}
234226
continue
235227
}
236228

@@ -242,18 +234,6 @@ func (c *Consume) filterAlertEvents(faultCenter models.FaultCenter, alerts map[s
242234
return newEvents
243235
}
244236

245-
// isMutedEvent 静默检查
246-
func (c *Consume) isMutedEvent(event *models.AlertCurEvent, faultCenter models.FaultCenter) bool {
247-
return mute.IsMuted(mute.MuteParams{
248-
EffectiveTime: event.EffectiveTime,
249-
IsRecovered: event.IsRecovered,
250-
TenantId: event.TenantId,
251-
Labels: event.Labels,
252-
FaultCenterId: event.FaultCenterId,
253-
RecoverNotify: faultCenter.RecoverNotify,
254-
})
255-
}
256-
257237
// validateEvent 事件验证
258238
func (c *Consume) validateEvent(event *models.AlertCurEvent, faultCenter models.FaultCenter) bool {
259239
return event.IsRecovered || event.LastSendTime == 0 ||
@@ -279,12 +259,6 @@ func (c *Consume) alarmGrouping(faultCenter models.FaultCenter, alertGroups *Ale
279259
}
280260

281261
alertGroups.AddAlert(stateId, alert, faultCenter)
282-
if alert.IsRecovered {
283-
c.removeAlertFromCache(alert)
284-
if err := process.RecordAlertHisEvent(c.ctx, *alert); err != nil {
285-
logc.Error(c.ctx.Ctx, fmt.Sprintf("Failed to record alert history: %v", err))
286-
}
287-
}
288262
}
289263
}
290264

@@ -304,7 +278,7 @@ func (c *Consume) sendAlerts(faultCenter models.FaultCenter, aggEvents *AlertGro
304278
func (c *Consume) processAlertGroup(faultCenter models.FaultCenter, noticeId string, alerts []*models.AlertCurEvent) {
305279
g := new(errgroup.Group)
306280
g.Go(func() error { return c.handleSubscribe(alerts) })
307-
g.Go(func() error { return process.HandleAlert(c.ctx, "alarm", faultCenter, noticeId, alerts) })
281+
g.Go(func() error { return handleAlert(c.ctx, "alarm", faultCenter, noticeId, alerts) })
308282

309283
if err := g.Wait(); err != nil {
310284
logc.Errorf(c.ctx.Ctx, "Alert group processing failed: %v", err)
@@ -352,15 +326,15 @@ func (c *Consume) processSilenceRule(faultCenter models.FaultCenter) {
352326
// 获取静默列表中所有的id
353327
silenceIds, err := silenceCtx.GetAlertMutes(faultCenter.TenantId, faultCenter.ID)
354328
if err != nil {
355-
logc.Errorf(ctx.Ctx, err.Error())
329+
logc.Error(ctx.Ctx, err.Error())
356330
return
357331
}
358332

359333
// 根据ID获取到详细的静默规则
360334
for _, silenceId := range silenceIds {
361335
muteRule, err := silenceCtx.WithIdGetMuteFromCache(faultCenter.TenantId, faultCenter.ID, silenceId)
362336
if err != nil {
363-
logc.Errorf(ctx.Ctx, err.Error())
337+
logc.Error(ctx.Ctx, err.Error())
364338
return
365339
}
366340

Lines changed: 53 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
1-
package process
1+
package consumer
22

33
import (
44
"fmt"
55
"slices"
66
"strings"
77
"time"
8+
"watchAlert/alert/mute"
89
"watchAlert/internal/ctx"
910
"watchAlert/internal/models"
1011
"watchAlert/pkg/sender"
@@ -15,8 +16,8 @@ import (
1516
"golang.org/x/sync/errgroup"
1617
)
1718

18-
// HandleAlert 处理告警逻辑
19-
func HandleAlert(ctx *ctx.Context, processType string, faultCenter models.FaultCenter, noticeId string, alerts []*models.AlertCurEvent) error {
19+
// handleAlert 处理告警逻辑
20+
func handleAlert(ctx *ctx.Context, processType string, faultCenter models.FaultCenter, noticeId string, alerts []*models.AlertCurEvent) error {
2021
curTime := time.Now().Unix()
2122
g := new(errgroup.Group)
2223

@@ -50,6 +51,10 @@ func HandleAlert(ctx *ctx.Context, processType string, faultCenter models.FaultC
5051
// 获取当前事件等级对应的路由配置
5152
routes := getNoticeRoutes(noticeData, severity)
5253
for _, event := range events {
54+
if event.Fingerprint == "" {
55+
continue
56+
}
57+
5358
if processType == "alarm" && !event.IsRecovered {
5459
event.LastSendTime = curTime
5560
ctx.Redis.Alert().PushAlertEvent(event)
@@ -59,9 +64,20 @@ func HandleAlert(ctx *ctx.Context, processType string, faultCenter models.FaultC
5964
logc.Infof(ctx.Ctx, "没用匹配的通知策略, 告警事件名称: %s, 通知对象名称: %s", event.RuleName, noticeData.Name)
6065
}
6166

67+
if mute.IsMuted(mute.MuteParams{
68+
EffectiveTime: event.EffectiveTime,
69+
IsRecovered: event.IsRecovered,
70+
TenantId: event.TenantId,
71+
Labels: event.Labels,
72+
FaultCenterId: event.FaultCenterId,
73+
RecoverNotify: faultCenter.RecoverNotify,
74+
}) {
75+
continue
76+
}
77+
6278
for _, route := range routes {
6379
// 设置值班用户信息
64-
event.DutyUser = strings.Join(GetDutyUsers(ctx, noticeData, route.NoticeType), " ")
80+
event.DutyUser = strings.Join(getDutyUsers(ctx, noticeData, route.NoticeType), " ")
6581

6682
// 生成告警内容
6783
content := generateAlertContent(ctx, event, noticeData, route)
@@ -136,10 +152,8 @@ func withRuleGroupByAlerts(ctx *ctx.Context, timeInt int64, alerts []*models.Ale
136152
}
137153
}
138154

139-
aggregatedAlert := alerts[0]
140-
aggregatedAlert.Annotations += fmt.Sprintf("\n聚合 %d 条消息,详情请前往 WatchAlert 查看\n", len(alerts))
141-
142-
return []*models.AlertCurEvent{aggregatedAlert}
155+
event := *alerts[0]
156+
return []*models.AlertCurEvent{&event}
143157
}
144158

145159
// getNoticeData 获取 Notice 数据
@@ -198,3 +212,34 @@ func generateAlertContent(ctx *ctx.Context, alert *models.AlertCurEvent, noticeD
198212
}
199213
return template.CardContentMsg
200214
}
215+
216+
func getDutyUsers(ctx *ctx.Context, noticeData models.AlertNotice, noticeType string) []string {
217+
var us []string
218+
users, ok := ctx.DB.DutyCalendar().GetDutyUserInfo(*noticeData.GetDutyId(), time.Now().Format("2006-1-2"))
219+
if ok {
220+
switch noticeType {
221+
case "FeiShu":
222+
for _, user := range users {
223+
us = append(us, fmt.Sprintf("<at id=%s></at>", user.DutyUserId))
224+
}
225+
return us
226+
case "DingDing":
227+
for _, user := range users {
228+
us = append(us, fmt.Sprintf("@%s", user.DutyUserId))
229+
}
230+
return us
231+
case "Email", "WeChat", "WebHook":
232+
for _, user := range users {
233+
us = append(us, fmt.Sprintf("@%s", user.UserName))
234+
}
235+
return us
236+
case "Slack":
237+
for _, user := range users {
238+
us = append(us, fmt.Sprintf("<@%s>", user.DutyUserId))
239+
}
240+
return us
241+
}
242+
}
243+
244+
return []string{"暂无"}
245+
}

alert/consumer/upgrader.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ import (
44
"fmt"
55
"time"
66
"watchAlert/alert/mute"
7-
"watchAlert/alert/process"
87
"watchAlert/internal/ctx"
98
"watchAlert/internal/models"
109

@@ -185,7 +184,7 @@ func sendAggregatedAlert(ctx *ctx.Context, faultCenter models.FaultCenter, aggre
185184
aggregated.Fingerprints,
186185
aggregated.Timeout))
187186

188-
return process.HandleAlert(ctx, "upgrade", faultCenter, noticeId, aggregated.Events)
187+
return handleAlert(ctx, "upgrade", faultCenter, noticeId, aggregated.Events)
189188
}
190189

191190
// getContent 生成聚合通知内容

alert/process/process.go

Lines changed: 1 addition & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@ package process
22

33
import (
44
"fmt"
5-
"time"
6-
"watchAlert/alert/mute"
75
"watchAlert/internal/ctx"
86
"watchAlert/internal/models"
97

@@ -59,29 +57,13 @@ func PushEventToFaultCenter(ctx *ctx.Context, event *models.AlertCurEvent) {
5957
event.Status = currentStatus
6058
}
6159

62-
// 检查是否处于静默状态
63-
isSilenced := IsSilencedEvent(event)
64-
6560
// 根据不同情况处理状态转换
6661
switch event.Status {
6762
case models.StatePreAlert:
68-
// 如果需要静默
69-
if isSilenced {
70-
event.TransitionStatus(models.StateSilenced)
71-
} else if event.IsArriveForDuration() {
63+
if event.IsArriveForDuration() {
7264
// 如果达到持续时间,转为告警状态
7365
event.TransitionStatus(models.StateAlerting)
7466
}
75-
case models.StateAlerting:
76-
// 如果需要静默
77-
if isSilenced {
78-
event.TransitionStatus(models.StateSilenced)
79-
}
80-
case models.StateSilenced:
81-
// 如果不再静默,转换回预告警状态
82-
if !isSilenced {
83-
event.TransitionStatus(models.StateAlerting)
84-
}
8567
}
8668

8769
// 最终再次校验 fingerprint 非空,避免 push 时使用空 key
@@ -94,48 +76,6 @@ func PushEventToFaultCenter(ctx *ctx.Context, event *models.AlertCurEvent) {
9476
cache.Alert().PushAlertEvent(event)
9577
}
9678

97-
// IsSilencedEvent 静默检查
98-
func IsSilencedEvent(event *models.AlertCurEvent) bool {
99-
return mute.IsSilence(mute.MuteParams{
100-
EffectiveTime: event.EffectiveTime,
101-
IsRecovered: event.IsRecovered,
102-
TenantId: event.TenantId,
103-
Labels: event.Labels,
104-
FaultCenterId: event.FaultCenterId,
105-
})
106-
}
107-
108-
func GetDutyUsers(ctx *ctx.Context, noticeData models.AlertNotice, noticeType string) []string {
109-
var us []string
110-
users, ok := ctx.DB.DutyCalendar().GetDutyUserInfo(*noticeData.GetDutyId(), time.Now().Format("2006-1-2"))
111-
if ok {
112-
switch noticeType {
113-
case "FeiShu":
114-
for _, user := range users {
115-
us = append(us, fmt.Sprintf("<at id=%s></at>", user.DutyUserId))
116-
}
117-
return us
118-
case "DingDing":
119-
for _, user := range users {
120-
us = append(us, fmt.Sprintf("@%s", user.DutyUserId))
121-
}
122-
return us
123-
case "Email", "WeChat", "WebHook":
124-
for _, user := range users {
125-
us = append(us, fmt.Sprintf("@%s", user.UserName))
126-
}
127-
return us
128-
case "Slack":
129-
for _, user := range users {
130-
us = append(us, fmt.Sprintf("<@%s>", user.DutyUserId))
131-
}
132-
return us
133-
}
134-
}
135-
136-
return []string{"暂无"}
137-
}
138-
13979
// RecordAlertHisEvent 记录历史告警
14080
func RecordAlertHisEvent(ctx *ctx.Context, alert models.AlertCurEvent) error {
14181
hisData := models.AlertHisEvent{

internal/models/alert_current_event.go

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ const (
1515
StateAlerting AlertStatus = "alerting" // 告警中
1616
StatePendingRecovery AlertStatus = "pending_recovery" // 待恢复
1717
StateRecovered AlertStatus = "recovered" // 已恢复
18-
StateSilenced AlertStatus = "silenced" // 静默中
1918
)
2019

2120
type AlertCurEvent struct {
@@ -86,11 +85,10 @@ func (alert *AlertCurEvent) validateTransition(newState AlertStatus) error {
8685

8786
// 定义允许的状态转换规则
8887
allowedTransitions := map[AlertStatus][]AlertStatus{
89-
StatePreAlert: {StateAlerting, StateSilenced},
90-
StateAlerting: {StatePendingRecovery, StateSilenced},
88+
StatePreAlert: {StateAlerting},
89+
StateAlerting: {StatePendingRecovery},
9190
StatePendingRecovery: {StateAlerting, StateRecovered},
9291
StateRecovered: {StatePreAlert},
93-
StateSilenced: {StatePreAlert, StateAlerting, StatePendingRecovery, StateRecovered},
9492
}
9593

9694
// 检查转换是否允许
@@ -130,8 +128,6 @@ func (alert *AlertCurEvent) handleStateTransition(newState AlertStatus) error {
130128
alert.LastSendTime = 0
131129
alert.RecoverTime = now
132130
alert.IsRecovered = true
133-
134-
case StateSilenced:
135131
}
136132

137133
return nil

internal/models/fault_center.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ type FaultCenter struct {
2525
RecoverWaitTime int64 `json:"recoverWaitTime"` // 告警恢复等待时间,单位(秒)
2626
CurrentPreAlertNumber int64 `json:"currentPreAlertNumber" gorm:"-"`
2727
CurrentAlertNumber int64 `json:"currentAlertNumber" gorm:"-"`
28-
CurrentMuteNumber int64 `json:"currentMuteNumber" gorm:"-"`
2928
CurrentRecoverNumber int64 `json:"currentRecoverNumber" gorm:"-"`
3029
IsUpgradeEnabled *bool `json:"isUpgradeEnabled" gorm:"column:isUpgradeEnabled"`
3130
UpgradableSeverity []string `json:"upgradableSeverity" gorm:"column:upgradableSeverity;serializer:json"`

internal/services/fault_center.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -168,8 +168,6 @@ func (f faultCenterService) List(req interface{}) (data interface{}, err interfa
168168
faultCenters[index].CurrentPreAlertNumber++
169169
case models.StateAlerting:
170170
faultCenters[index].CurrentAlertNumber++
171-
case models.StateSilenced:
172-
faultCenters[index].CurrentMuteNumber++
173171
case models.StatePendingRecovery:
174172
faultCenters[index].CurrentRecoverNumber++
175173
}

internal/types/faultCenter.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ type RequestFaultCenterCreate struct {
1616
RecoverWaitTime int64 `json:"recoverWaitTime"` // 告警恢复等待时间,单位(秒)
1717
CurrentPreAlertNumber int64 `json:"currentPreAlertNumber" gorm:"-"`
1818
CurrentAlertNumber int64 `json:"currentAlertNumber" gorm:"-"`
19-
CurrentMuteNumber int64 `json:"currentMuteNumber" gorm:"-"`
2019
CurrentRecoverNumber int64 `json:"currentRecoverNumber" gorm:"-"`
2120
IsUpgradeEnabled *bool `json:"isUpgradeEnabled" gorm:"column:isUpgradeEnabled"`
2221
UpgradableSeverity []string `json:"upgradableSeverity" gorm:"column:upgradableSeverity;serializer:json"`
@@ -38,7 +37,6 @@ type RequestFaultCenterUpdate struct {
3837
RecoverWaitTime int64 `json:"recoverWaitTime"` // 告警恢复等待时间,单位(秒)
3938
CurrentPreAlertNumber int64 `json:"currentPreAlertNumber" gorm:"-"`
4039
CurrentAlertNumber int64 `json:"currentAlertNumber" gorm:"-"`
41-
CurrentMuteNumber int64 `json:"currentMuteNumber" gorm:"-"`
4240
CurrentRecoverNumber int64 `json:"currentRecoverNumber" gorm:"-"`
4341
IsUpgradeEnabled *bool `json:"isUpgradeEnabled" gorm:"column:isUpgradeEnabled"`
4442
UpgradableSeverity []string `json:"upgradableSeverity" gorm:"column:upgradableSeverity;serializer:json"`

pkg/provider/check.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@ package provider
33
import (
44
"context"
55
"fmt"
6-
"github.com/zeromicro/go-zero/core/logc"
76
"watchAlert/internal/models"
7+
8+
"github.com/zeromicro/go-zero/core/logc"
89
)
910

1011
// HealthChecker 统一健康检查接口
@@ -82,7 +83,7 @@ func CheckDatasourceHealth(datasource models.AlertDataSource) (bool, error) {
8283

8384
// 统一日志记录方法
8485
func logDatasourceError(ds models.AlertDataSource, err error) {
85-
logc.Errorf(context.Background(), "Datasource error",
86+
logc.Error(context.Background(), "Datasource error",
8687
map[string]interface{}{
8788
"id": ds.ID,
8889
"name": ds.Name,

0 commit comments

Comments
 (0)