@@ -8,11 +8,13 @@ import (
8
8
9
9
"github.com/google/uuid"
10
10
"github.com/rs/zerolog/log"
11
+ "go.opentelemetry.io/otel/attribute"
11
12
12
13
"github.com/bacalhau-project/bacalhau/pkg/jobstore"
13
14
"github.com/bacalhau-project/bacalhau/pkg/lib/envelope"
14
15
"github.com/bacalhau-project/bacalhau/pkg/models"
15
16
"github.com/bacalhau-project/bacalhau/pkg/models/messages"
17
+ "github.com/bacalhau-project/bacalhau/pkg/telemetry"
16
18
)
17
19
18
20
// MessageHandler base implementation of requester Endpoint
@@ -35,27 +37,38 @@ func (m *MessageHandler) ShouldProcess(ctx context.Context, message *envelope.Me
35
37
36
38
// HandleMessage handles incoming messages
37
39
// TODO: handle messages arriving out of order gracefully
38
- func (m * MessageHandler ) HandleMessage (ctx context.Context , message * envelope.Message ) error {
39
- var err error
40
+ func (m * MessageHandler ) HandleMessage (ctx context.Context , message * envelope.Message ) (err error ) {
41
+ metrics := telemetry .NewMetricRecorder (
42
+ attribute .String (AttrMessageType , message .Metadata .Get (envelope .KeyMessageType )),
43
+ attribute .String (AttrOutcomeKey , AttrOutcomeSuccess ),
44
+ )
45
+ defer func () {
46
+ metrics .Count (ctx , messageHandlerProcessCount )
47
+ metrics .Done (ctx , messageHandlerProcessDuration )
48
+ }()
49
+
40
50
switch message .Metadata .Get (envelope .KeyMessageType ) {
41
51
case messages .BidResultMessageType :
42
- err = m .OnBidComplete (ctx , message )
52
+ err = m .OnBidComplete (ctx , metrics , message )
43
53
case messages .RunResultMessageType :
44
- err = m .OnRunComplete (ctx , message )
54
+ err = m .OnRunComplete (ctx , metrics , message )
45
55
case messages .ComputeErrorMessageType :
46
- err = m .OnComputeFailure (ctx , message )
56
+ err = m .OnComputeFailure (ctx , metrics , message )
47
57
}
48
58
49
- return m .handleError (ctx , message , err )
59
+ return m .handleError (ctx , metrics , message , err )
50
60
}
51
61
52
62
// handleError logs the error with context and returns nil.
53
63
// In the future, this can be extended to handle different error types differently.
54
- func (m * MessageHandler ) handleError (ctx context.Context , message * envelope.Message , err error ) error {
64
+ func (m * MessageHandler ) handleError (ctx context.Context , metrics * telemetry. MetricRecorder , message * envelope.Message , err error ) error {
55
65
if err == nil {
56
66
return nil
57
67
}
58
68
69
+ metrics .Error (err )
70
+ metrics .AddAttributes (attribute .String (AttrOutcomeKey , AttrOutcomeFailure ))
71
+
59
72
// For now, just log the error and return nil
60
73
logger := log .Ctx (ctx ).Error ()
61
74
for key , value := range message .Metadata .ToMap () {
@@ -66,7 +79,7 @@ func (m *MessageHandler) handleError(ctx context.Context, message *envelope.Mess
66
79
}
67
80
68
81
// OnBidComplete handles the completion of a bid request
69
- func (m * MessageHandler ) OnBidComplete (ctx context.Context , message * envelope.Message ) error {
82
+ func (m * MessageHandler ) OnBidComplete (ctx context.Context , metrics * telemetry. MetricRecorder , message * envelope.Message ) error {
70
83
result , ok := message .Payload .(* messages.BidResult )
71
84
if ! ok {
72
85
return envelope .NewErrUnexpectedPayloadType ("BidResult" , reflect .TypeOf (message .Payload ).String ())
@@ -92,6 +105,7 @@ func (m *MessageHandler) OnBidComplete(ctx context.Context, message *envelope.Me
92
105
}
93
106
94
107
txContext , err := m .store .BeginTx (ctx )
108
+ metrics .Latency (ctx , messageHandlerProcessPartDuration , AttrPartBeginTx )
95
109
if err != nil {
96
110
return fmt .Errorf ("failed to begin transaction: %w" , err )
97
111
}
@@ -101,30 +115,36 @@ func (m *MessageHandler) OnBidComplete(ctx context.Context, message *envelope.Me
101
115
if err = m .store .UpdateExecution (txContext , updateRequest ); err != nil {
102
116
return err
103
117
}
118
+ metrics .Latency (ctx , messageHandlerProcessPartDuration , AttrPartUpdateExec )
104
119
105
120
// enqueue evaluation to allow the scheduler to either accept the bid, or find a new node
106
121
err = m .enqueueEvaluation (txContext , result .JobID , result .JobType )
107
122
if err != nil {
108
123
return err
109
124
}
125
+ metrics .Latency (ctx , messageHandlerProcessPartDuration , AttrPartCreateEval )
110
126
111
- return txContext .Commit ()
127
+ err = txContext .Commit ()
128
+ metrics .Latency (ctx , messageHandlerProcessPartDuration , AttrPartCommitTx )
129
+ return err
112
130
}
113
131
114
- func (m * MessageHandler ) OnRunComplete (ctx context.Context , message * envelope.Message ) error {
132
+ func (m * MessageHandler ) OnRunComplete (ctx context.Context , metrics * telemetry. MetricRecorder , message * envelope.Message ) error {
115
133
result , ok := message .Payload .(* messages.RunResult )
116
134
if ! ok {
117
135
return envelope .NewErrUnexpectedPayloadType ("RunResult" , reflect .TypeOf (message .Payload ).String ())
118
136
}
119
137
120
138
txContext , err := m .store .BeginTx (ctx )
139
+ metrics .Latency (ctx , messageHandlerProcessPartDuration , AttrPartBeginTx )
121
140
if err != nil {
122
141
return fmt .Errorf ("failed to begin transaction: %w" , err )
123
142
}
124
143
125
144
defer txContext .Rollback () //nolint:errcheck
126
145
127
146
job , err := m .store .GetJob (txContext , result .JobID )
147
+ metrics .Latency (ctx , messageHandlerProcessPartDuration , AttrPartGetJob )
128
148
if err != nil {
129
149
return err
130
150
}
@@ -158,22 +178,27 @@ func (m *MessageHandler) OnRunComplete(ctx context.Context, message *envelope.Me
158
178
if err = m .store .UpdateExecution (txContext , updateRequest ); err != nil {
159
179
return err
160
180
}
181
+ metrics .Latency (ctx , messageHandlerProcessPartDuration , AttrPartUpdateExec )
161
182
162
183
// enqueue evaluation to allow the scheduler to mark the job as completed if all executions are completed
163
184
if err = m .enqueueEvaluation (txContext , result .JobID , result .JobType ); err != nil {
164
185
return err
165
186
}
187
+ metrics .Latency (ctx , messageHandlerProcessPartDuration , AttrPartCreateEval )
166
188
167
- return txContext .Commit ()
189
+ err = txContext .Commit ()
190
+ metrics .Latency (ctx , messageHandlerProcessPartDuration , AttrPartCommitTx )
191
+ return err
168
192
}
169
193
170
- func (m * MessageHandler ) OnComputeFailure (ctx context.Context , message * envelope.Message ) error {
194
+ func (m * MessageHandler ) OnComputeFailure (ctx context.Context , metrics * telemetry. MetricRecorder , message * envelope.Message ) error {
171
195
result , ok := message .Payload .(* messages.ComputeError )
172
196
if ! ok {
173
197
return envelope .NewErrUnexpectedPayloadType ("ComputeError" , reflect .TypeOf (message .Payload ).String ())
174
198
}
175
199
176
200
txContext , err := m .store .BeginTx (ctx )
201
+ metrics .Latency (ctx , messageHandlerProcessPartDuration , AttrPartBeginTx )
177
202
if err != nil {
178
203
return fmt .Errorf ("failed to begin transaction: %w" , err )
179
204
}
@@ -197,13 +222,17 @@ func (m *MessageHandler) OnComputeFailure(ctx context.Context, message *envelope
197
222
}); err != nil {
198
223
return err
199
224
}
225
+ metrics .Latency (ctx , messageHandlerProcessPartDuration , AttrPartUpdateExec )
200
226
201
227
// enqueue evaluation to allow the scheduler find other nodes, or mark the job as failed
202
228
if err = m .enqueueEvaluation (txContext , result .JobID , result .JobType ); err != nil {
203
229
return err
204
230
}
231
+ metrics .Latency (ctx , messageHandlerProcessPartDuration , AttrPartCreateEval )
205
232
206
- return txContext .Commit ()
233
+ err = txContext .Commit ()
234
+ metrics .Latency (ctx , messageHandlerProcessPartDuration , AttrPartCommitTx )
235
+ return err
207
236
}
208
237
209
238
// enqueueEvaluation enqueues an evaluation to allow the scheduler to either accept the bid, or find a new node
0 commit comments