9
9
"sync"
10
10
"time"
11
11
12
+ "github.com/VictoriaMetrics/metrics"
12
13
"github.com/twmb/franz-go/pkg/kadm"
13
14
"github.com/twmb/franz-go/pkg/kgo"
14
15
)
@@ -48,10 +49,11 @@ type Server struct {
48
49
49
50
// SourcePool manages the source Kafka instances and consumption.
50
51
type SourcePool struct {
51
- cfg SourcePoolCfg
52
- client * kgo.Client
53
- log * slog.Logger
54
- topics []string
52
+ cfg SourcePoolCfg
53
+ client * kgo.Client
54
+ log * slog.Logger
55
+ metrics * metrics.Set
56
+ topics []string
55
57
56
58
offsets map [string ]map [int32 ]kgo.Offset
57
59
82
84
83
85
// NewSourcePool returns a controller instance that manages the lifecycle of a pool of N source (consumer)
84
86
// servers. The pool always attempts to find one healthy node for the relay to consume from.
85
- func NewSourcePool (cfg SourcePoolCfg , serverCfgs []ConsumerGroupCfg , topics Topics , log * slog.Logger ) (* SourcePool , error ) {
87
+ func NewSourcePool (cfg SourcePoolCfg , serverCfgs []ConsumerGroupCfg , topics Topics , m * metrics. Set , log * slog.Logger ) (* SourcePool , error ) {
86
88
servers := make ([]Server , 0 , len (serverCfgs ))
87
89
88
90
// Initially mark all servers as unhealthy.
@@ -105,6 +107,7 @@ func NewSourcePool(cfg SourcePoolCfg, serverCfgs []ConsumerGroupCfg, topics Topi
105
107
topics : topicNames ,
106
108
servers : servers ,
107
109
log : log ,
110
+ metrics : m ,
108
111
backoffFn : getBackoffFn (cfg .EnableBackoff , cfg .BackoffMin , cfg .BackoffMax ),
109
112
}, nil
110
113
}
@@ -154,6 +157,7 @@ loop:
154
157
conn , err := sp .newConn (globalCtx , s )
155
158
if err != nil {
156
159
retries ++
160
+ sp .metrics .GetOrCreateCounter (fmt .Sprintf (SrcNetworkErrMetric , s .ID , "new connection failed" )).Inc ()
157
161
sp .log .Error ("new source connection failed" , "id" , s .ID , "broker" , s .Config .BootstrapBrokers , "error" , err , "retries" , retries )
158
162
waitTries (globalCtx , sp .backoffFn (retries ))
159
163
continue loop
@@ -170,6 +174,7 @@ loop:
170
174
}
171
175
172
176
retries ++
177
+ sp .metrics .GetOrCreateCounter (SrcsUnhealthyMetric ).Inc ()
173
178
sp .log .Error ("no healthy server found. waiting and retrying" , "retries" , retries )
174
179
waitTries (globalCtx , sp .backoffFn (retries ))
175
180
}
@@ -183,6 +188,7 @@ func (sp *SourcePool) GetFetches(s *Server) (kgo.Fetches, error) {
183
188
184
189
// There's no connection.
185
190
if fetches .IsClientClosed () {
191
+ sp .metrics .GetOrCreateCounter (fmt .Sprintf (SrcKafkaErrMetric , s .ID , "client closed" )).Inc ()
186
192
sp .log .Debug ("retrieving fetches failed. client closed." , "id" , s .ID , "broker" , s .Config .BootstrapBrokers )
187
193
sp .setWeight (s .ID , unhealthyWeight )
188
194
@@ -191,6 +197,7 @@ func (sp *SourcePool) GetFetches(s *Server) (kgo.Fetches, error) {
191
197
192
198
// If there are errors in the fetches, handle them.
193
199
for _ , err := range fetches .Errors () {
200
+ sp .metrics .GetOrCreateCounter (fmt .Sprintf (SrcKafkaErrMetric , s .ID , "fetches error" )).Inc ()
194
201
sp .log .Error ("found error in fetches" , "server" , s .ID , "error" , err .Err )
195
202
sp .setWeight (s .ID , unhealthyWeight )
196
203
@@ -513,6 +520,7 @@ func (sp *SourcePool) setWeight(id int, weight int64) {
513
520
sp .curCandidate = s
514
521
}
515
522
523
+ sp .metrics .GetOrCreateCounter (fmt .Sprintf (SrcHealthMetric , id )).Set (uint64 (weight ))
516
524
sp .log .Debug ("setting candidate weight" , "id" , id , "weight" , weight , "curr" , sp .curCandidate )
517
525
sp .servers [id ] = s
518
526
break
0 commit comments