@@ -29,6 +29,41 @@ namespace {
29
29
30
30
// //////////////////////////////////////////////////////////////////////////////
31
31
32
+ struct TStorageProxyMetrics : public TThrRefBase {
33
+ explicit TStorageProxyMetrics (const ::NMonitoring::TDynamicCounterPtr& counters)
34
+ : Counters(counters)
35
+ , Errors(Counters->GetCounter (" Errors" , true ))
36
+ , Inflight(Counters->GetCounter (" Inflight" ))
37
+ , LatencyMs(Counters->GetHistogram (" LatencyMs" , ::NMonitoring::ExplicitHistogram({1 , 5 , 20 , 100 , 500 , 2000 , 10000 , 50000 })))
38
+ {}
39
+
40
+ ::NMonitoring::TDynamicCounterPtr Counters;
41
+ ::NMonitoring::TDynamicCounters::TCounterPtr Errors;
42
+ ::NMonitoring::TDynamicCounters::TCounterPtr Inflight;
43
+ ::NMonitoring::THistogramPtr LatencyMs;
44
+ };
45
+
46
+ using TStorageProxyMetricsPtr = TIntrusivePtr<TStorageProxyMetrics>;
47
+
48
+ struct TRequestContext : public TThrRefBase {
49
+ TInstant StartTime = TInstant::Now();
50
+ const TStorageProxyMetricsPtr Metrics;
51
+
52
+ TRequestContext (const TStorageProxyMetricsPtr& metrics)
53
+ : Metrics(metrics) {
54
+ Metrics->Inflight ->Inc ();
55
+ }
56
+
57
+ ~TRequestContext () {
58
+ Metrics->Inflight ->Dec ();
59
+ Metrics->LatencyMs ->Collect ((TInstant::Now () - StartTime).MilliSeconds ());
60
+ }
61
+
62
+ void IncError () {
63
+ Metrics->Errors ->Inc ();
64
+ }
65
+ };
66
+
32
67
class TStorageProxy : public TActorBootstrapped <TStorageProxy> {
33
68
NConfig::TCheckpointCoordinatorConfig Config;
34
69
NConfig::TCommonConfig CommonConfig;
@@ -38,13 +73,15 @@ class TStorageProxy : public TActorBootstrapped<TStorageProxy> {
38
73
TActorId ActorGC;
39
74
NKikimr::TYdbCredentialsProviderFactory CredentialsProviderFactory;
40
75
TYqSharedResources::TPtr YqSharedResources;
76
+ const TStorageProxyMetricsPtr Metrics;
41
77
42
78
public:
43
79
explicit TStorageProxy (
44
80
const NConfig::TCheckpointCoordinatorConfig& config,
45
81
const NConfig::TCommonConfig& commonConfig,
46
82
const NKikimr::TYdbCredentialsProviderFactory& credentialsProviderFactory,
47
- const TYqSharedResources::TPtr& yqSharedResources);
83
+ const TYqSharedResources::TPtr& yqSharedResources,
84
+ const ::NMonitoring::TDynamicCounterPtr& counters);
48
85
49
86
void Bootstrap ();
50
87
@@ -103,12 +140,14 @@ TStorageProxy::TStorageProxy(
103
140
const NConfig::TCheckpointCoordinatorConfig& config,
104
141
const NConfig::TCommonConfig& commonConfig,
105
142
const NKikimr::TYdbCredentialsProviderFactory& credentialsProviderFactory,
106
- const TYqSharedResources::TPtr& yqSharedResources)
143
+ const TYqSharedResources::TPtr& yqSharedResources,
144
+ const ::NMonitoring::TDynamicCounterPtr& counters)
107
145
: Config(config)
108
146
, CommonConfig(commonConfig)
109
147
, StorageConfig(Config.GetStorage())
110
148
, CredentialsProviderFactory(credentialsProviderFactory)
111
- , YqSharedResources(yqSharedResources) {
149
+ , YqSharedResources(yqSharedResources)
150
+ , Metrics(MakeIntrusive<TStorageProxyMetrics>(counters)) {
112
151
FillDefaultParameters (Config, StorageConfig);
113
152
}
114
153
@@ -138,17 +177,21 @@ void TStorageProxy::Bootstrap() {
138
177
}
139
178
140
179
void TStorageProxy::Handle (TEvCheckpointStorage::TEvRegisterCoordinatorRequest::TPtr& ev) {
180
+ auto context = MakeIntrusive<TRequestContext>(Metrics);
181
+
141
182
const auto * event = ev->Get ();
142
183
LOG_STREAMS_STORAGE_SERVICE_DEBUG (" [" << event->CoordinatorId << " ] Got TEvRegisterCoordinatorRequest" )
143
184
144
185
CheckpointStorage->RegisterGraphCoordinator (event->CoordinatorId )
145
186
.Apply ([coordinatorId = event->CoordinatorId ,
146
187
cookie = ev->Cookie ,
147
188
sender = ev->Sender ,
148
- actorSystem = TActivationContext::ActorSystem ()] (const NThreading::TFuture<NYql::TIssues>& issuesFuture) {
189
+ actorSystem = TActivationContext::ActorSystem (),
190
+ context] (const NThreading::TFuture<NYql::TIssues>& issuesFuture) {
149
191
auto response = std::make_unique<TEvCheckpointStorage::TEvRegisterCoordinatorResponse>();
150
192
response->Issues = issuesFuture.GetValue ();
151
193
if (response->Issues ) {
194
+ context->IncError ();
152
195
LOG_STREAMS_STORAGE_SERVICE_AS_WARN (*actorSystem, " [" << coordinatorId << " ] Failed to register graph: " << response->Issues .ToString ())
153
196
} else {
154
197
LOG_STREAMS_STORAGE_SERVICE_AS_INFO (*actorSystem, " [" << coordinatorId << " ] Graph registered" )
@@ -159,6 +202,7 @@ void TStorageProxy::Handle(TEvCheckpointStorage::TEvRegisterCoordinatorRequest::
159
202
}
160
203
161
204
void TStorageProxy::Handle (TEvCheckpointStorage::TEvCreateCheckpointRequest::TPtr& ev) {
205
+ auto context = MakeIntrusive<TRequestContext>(Metrics);
162
206
const auto * event = ev->Get ();
163
207
LOG_STREAMS_STORAGE_SERVICE_DEBUG (" [" << event->CoordinatorId << " ] [" << event->CheckpointId << " ] Got TEvCreateCheckpointRequest" )
164
208
@@ -169,7 +213,8 @@ void TStorageProxy::Handle(TEvCheckpointStorage::TEvCreateCheckpointRequest::TPt
169
213
sender = ev->Sender ,
170
214
totalGraphCheckpointsSizeLimit = Config.GetStateStorageLimits ().GetMaxGraphCheckpointsSizeBytes (),
171
215
graphDesc = std::move (event->GraphDescription ),
172
- storage = CheckpointStorage]
216
+ storage = CheckpointStorage,
217
+ context]
173
218
(const NThreading::TFuture<ICheckpointStorage::TGetTotalCheckpointsStateSizeResult>& resultFuture) {
174
219
auto [totalGraphCheckpointsSize, issues] = resultFuture.GetValue ();
175
220
@@ -179,6 +224,7 @@ void TStorageProxy::Handle(TEvCheckpointStorage::TEvCreateCheckpointRequest::TPt
179
224
issues.AddIssue (std::move (ss.Str ()));
180
225
}
181
226
if (issues) {
227
+ context->IncError ();
182
228
return NThreading::MakeFuture (ICheckpointStorage::TCreateCheckpointResult {TString (), std::move (issues) } );
183
229
}
184
230
if (std::holds_alternative<TString>(graphDesc)) {
@@ -191,11 +237,13 @@ void TStorageProxy::Handle(TEvCheckpointStorage::TEvCreateCheckpointRequest::TPt
191
237
coordinatorId = event->CoordinatorId ,
192
238
cookie = ev->Cookie ,
193
239
sender = ev->Sender ,
194
- actorSystem = TActivationContext::ActorSystem ()]
240
+ actorSystem = TActivationContext::ActorSystem (),
241
+ context]
195
242
(const NThreading::TFuture<ICheckpointStorage::TCreateCheckpointResult>& resultFuture) {
196
243
auto [graphDescId, issues] = resultFuture.GetValue ();
197
244
auto response = std::make_unique<TEvCheckpointStorage::TEvCreateCheckpointResponse>(checkpointId, std::move (issues), std::move (graphDescId));
198
245
if (response->Issues ) {
246
+ context->IncError ();
199
247
LOG_STREAMS_STORAGE_SERVICE_AS_WARN (*actorSystem, " [" << coordinatorId << " ] [" << checkpointId << " ] Failed to create checkpoint: " << response->Issues .ToString ());
200
248
} else {
201
249
LOG_STREAMS_STORAGE_SERVICE_AS_INFO (*actorSystem, " [" << coordinatorId << " ] [" << checkpointId << " ] Checkpoint created" );
@@ -206,18 +254,21 @@ void TStorageProxy::Handle(TEvCheckpointStorage::TEvCreateCheckpointRequest::TPt
206
254
}
207
255
208
256
void TStorageProxy::Handle (TEvCheckpointStorage::TEvSetCheckpointPendingCommitStatusRequest::TPtr& ev) {
257
+ auto context = MakeIntrusive<TRequestContext>(Metrics);
209
258
const auto * event = ev->Get ();
210
259
LOG_STREAMS_STORAGE_SERVICE_DEBUG (" [" << event->CoordinatorId << " ] [" << event->CheckpointId << " ] Got TEvSetCheckpointPendingCommitStatusRequest" )
211
260
CheckpointStorage->UpdateCheckpointStatus (event->CoordinatorId , event->CheckpointId , ECheckpointStatus::PendingCommit, ECheckpointStatus::Pending, event->StateSizeBytes )
212
261
.Apply ([checkpointId = event->CheckpointId ,
213
262
coordinatorId = event->CoordinatorId ,
214
263
cookie = ev->Cookie ,
215
264
sender = ev->Sender ,
216
- actorSystem = TActivationContext::ActorSystem ()]
265
+ actorSystem = TActivationContext::ActorSystem (),
266
+ context]
217
267
(const NThreading::TFuture<NYql::TIssues>& issuesFuture) {
218
268
auto issues = issuesFuture.GetValue ();
219
269
auto response = std::make_unique<TEvCheckpointStorage::TEvSetCheckpointPendingCommitStatusResponse>(checkpointId, std::move (issues));
220
270
if (response->Issues ) {
271
+ context->IncError ();
221
272
LOG_STREAMS_STORAGE_SERVICE_AS_WARN (*actorSystem, " [" << coordinatorId << " ] [" << checkpointId << " ] Failed to set 'PendingCommit' status: " << response->Issues .ToString ())
222
273
} else {
223
274
LOG_STREAMS_STORAGE_SERVICE_AS_INFO (*actorSystem, " [" << coordinatorId << " ] [" << checkpointId << " ] Status updated to 'PendingCommit'" )
@@ -228,6 +279,7 @@ void TStorageProxy::Handle(TEvCheckpointStorage::TEvSetCheckpointPendingCommitSt
228
279
}
229
280
230
281
void TStorageProxy::Handle (TEvCheckpointStorage::TEvCompleteCheckpointRequest::TPtr& ev) {
282
+ auto context = MakeIntrusive<TRequestContext>(Metrics);
231
283
const auto * event = ev->Get ();
232
284
LOG_STREAMS_STORAGE_SERVICE_DEBUG (" [" << event->CoordinatorId << " ] [" << event->CheckpointId << " ] Got TEvCompleteCheckpointRequest" )
233
285
CheckpointStorage->UpdateCheckpointStatus (event->CoordinatorId , event->CheckpointId , ECheckpointStatus::Completed, ECheckpointStatus::PendingCommit, event->StateSizeBytes )
@@ -238,11 +290,13 @@ void TStorageProxy::Handle(TEvCheckpointStorage::TEvCompleteCheckpointRequest::T
238
290
type = event->Type ,
239
291
gcEnabled = Config.GetCheckpointGarbageConfig ().GetEnabled (),
240
292
actorGC = ActorGC,
241
- actorSystem = TActivationContext::ActorSystem ()]
293
+ actorSystem = TActivationContext::ActorSystem (),
294
+ context]
242
295
(const NThreading::TFuture<NYql::TIssues>& issuesFuture) {
243
296
auto issues = issuesFuture.GetValue ();
244
297
auto response = std::make_unique<TEvCheckpointStorage::TEvCompleteCheckpointResponse>(checkpointId, std::move (issues));
245
298
if (response->Issues ) {
299
+ context->IncError ();
246
300
LOG_STREAMS_STORAGE_SERVICE_AS_DEBUG (*actorSystem, " [" << coordinatorId << " ] [" << checkpointId << " ] Failed to set 'Completed' status: " << response->Issues .ToString ())
247
301
} else {
248
302
LOG_STREAMS_STORAGE_SERVICE_AS_INFO (*actorSystem, " [" << coordinatorId << " ] [" << checkpointId << " ] Status updated to 'Completed'" )
@@ -258,17 +312,20 @@ void TStorageProxy::Handle(TEvCheckpointStorage::TEvCompleteCheckpointRequest::T
258
312
}
259
313
260
314
void TStorageProxy::Handle (TEvCheckpointStorage::TEvAbortCheckpointRequest::TPtr& ev) {
315
+ auto context = MakeIntrusive<TRequestContext>(Metrics);
261
316
const auto * event = ev->Get ();
262
317
LOG_STREAMS_STORAGE_SERVICE_DEBUG (" [" << event->CoordinatorId << " ] [" << event->CheckpointId << " ] Got TEvAbortCheckpointRequest" )
263
318
CheckpointStorage->AbortCheckpoint (event->CoordinatorId ,event->CheckpointId )
264
319
.Apply ([checkpointId = event->CheckpointId ,
265
320
coordinatorId = event->CoordinatorId ,
266
321
cookie = ev->Cookie ,
267
322
sender = ev->Sender ,
268
- actorSystem = TActivationContext::ActorSystem ()] (const NThreading::TFuture<NYql::TIssues>& issuesFuture) {
323
+ actorSystem = TActivationContext::ActorSystem (),
324
+ context] (const NThreading::TFuture<NYql::TIssues>& issuesFuture) {
269
325
auto issues = issuesFuture.GetValue ();
270
326
auto response = std::make_unique<TEvCheckpointStorage::TEvAbortCheckpointResponse>(checkpointId, std::move (issues));
271
327
if (response->Issues ) {
328
+ context->IncError ();
272
329
LOG_STREAMS_STORAGE_SERVICE_AS_WARN (*actorSystem, " [" << coordinatorId << " ] [" << checkpointId << " ] Failed to abort checkpoint: " << response->Issues .ToString ())
273
330
} else {
274
331
LOG_STREAMS_STORAGE_SERVICE_AS_INFO (*actorSystem, " [" << coordinatorId << " ] [" << checkpointId << " ] Checkpoint aborted" )
@@ -279,16 +336,19 @@ void TStorageProxy::Handle(TEvCheckpointStorage::TEvAbortCheckpointRequest::TPtr
279
336
}
280
337
281
338
void TStorageProxy::Handle (TEvCheckpointStorage::TEvGetCheckpointsMetadataRequest::TPtr& ev) {
339
+ auto context = MakeIntrusive<TRequestContext>(Metrics);
282
340
const auto * event = ev->Get ();
283
341
LOG_STREAMS_STORAGE_SERVICE_DEBUG (" [" << event->GraphId << " ] Got TEvGetCheckpointsMetadataRequest" );
284
342
CheckpointStorage->GetCheckpoints (event->GraphId , event->Statuses , event->Limit , event->LoadGraphDescription )
285
343
.Apply ([graphId = event->GraphId ,
286
344
cookie = ev->Cookie ,
287
345
sender = ev->Sender ,
288
- actorSystem = TActivationContext::ActorSystem ()] (const NThreading::TFuture<ICheckpointStorage::TGetCheckpointsResult>& futureResult) {
346
+ actorSystem = TActivationContext::ActorSystem (),
347
+ context] (const NThreading::TFuture<ICheckpointStorage::TGetCheckpointsResult>& futureResult) {
289
348
auto result = futureResult.GetValue ();
290
349
auto response = std::make_unique<TEvCheckpointStorage::TEvGetCheckpointsMetadataResponse>(result.first , result.second );
291
350
if (response->Issues ) {
351
+ context->IncError ();
292
352
LOG_STREAMS_STORAGE_SERVICE_AS_WARN (*actorSystem, " [" << graphId << " ] Failed to get checkpoints: " << response->Issues .ToString ())
293
353
}
294
354
LOG_STREAMS_STORAGE_SERVICE_AS_DEBUG (*actorSystem, " [" << graphId << " ] Send TEvGetCheckpointsMetadataResponse" )
@@ -297,6 +357,7 @@ void TStorageProxy::Handle(TEvCheckpointStorage::TEvGetCheckpointsMetadataReques
297
357
}
298
358
299
359
void TStorageProxy::Handle (NYql::NDq::TEvDqCompute::TEvSaveTaskState::TPtr& ev) {
360
+ auto context = MakeIntrusive<TRequestContext>(Metrics);
300
361
auto * event = ev->Get ();
301
362
const auto checkpointId = TCheckpointId (event->Checkpoint .GetGeneration (), event->Checkpoint .GetId ());
302
363
LOG_STREAMS_STORAGE_SERVICE_DEBUG (" [" << event->GraphId << " ] [" << checkpointId << " ] Got TEvSaveTaskState: task " << event->TaskId );
@@ -321,7 +382,8 @@ void TStorageProxy::Handle(NYql::NDq::TEvDqCompute::TEvSaveTaskState::TPtr& ev)
321
382
taskId = event->TaskId ,
322
383
cookie = ev->Cookie ,
323
384
sender = ev->Sender ,
324
- actorSystem = TActivationContext::ActorSystem ()](const NThreading::TFuture<IStateStorage::TSaveStateResult>& futureResult) {
385
+ actorSystem = TActivationContext::ActorSystem (),
386
+ context](const NThreading::TFuture<IStateStorage::TSaveStateResult>& futureResult) {
325
387
LOG_STREAMS_STORAGE_SERVICE_AS_DEBUG (*actorSystem, " [" << graphId << " ] [" << checkpointId << " ] TEvSaveTaskState Apply: task: " << taskId)
326
388
const auto & issues = futureResult.GetValue ().second ;
327
389
auto response = std::make_unique<NYql::NDq::TEvDqCompute::TEvSaveTaskStateResult>();
@@ -331,6 +393,7 @@ void TStorageProxy::Handle(NYql::NDq::TEvDqCompute::TEvSaveTaskState::TPtr& ev)
331
393
response->Record .SetTaskId (taskId);
332
394
333
395
if (issues) {
396
+ context->IncError ();
334
397
LOG_STREAMS_STORAGE_SERVICE_AS_WARN (*actorSystem, " [" << graphId << " ] [" << checkpointId << " ] Failed to save task state: task: " << taskId << " , issues: " << issues.ToString ())
335
398
response->Record .SetStatus (NYql::NDqProto::TEvSaveTaskStateResult::STORAGE_ERROR);
336
399
} else {
@@ -342,6 +405,7 @@ void TStorageProxy::Handle(NYql::NDq::TEvDqCompute::TEvSaveTaskState::TPtr& ev)
342
405
}
343
406
344
407
void TStorageProxy::Handle (NYql::NDq::TEvDqCompute::TEvGetTaskState::TPtr& ev) {
408
+ auto context = MakeIntrusive<TRequestContext>(Metrics);
345
409
const auto * event = ev->Get ();
346
410
const auto checkpointId = TCheckpointId (event->Checkpoint .GetGeneration (), event->Checkpoint .GetId ());
347
411
LOG_STREAMS_STORAGE_SERVICE_DEBUG (" [" << event->GraphId << " ] [" << checkpointId << " ] Got TEvGetTaskState: tasks {" << JoinSeq (" , " , event->TaskIds ) << " }" );
@@ -353,12 +417,14 @@ void TStorageProxy::Handle(NYql::NDq::TEvDqCompute::TEvGetTaskState::TPtr& ev) {
353
417
taskIds = event->TaskIds ,
354
418
cookie = ev->Cookie ,
355
419
sender = ev->Sender ,
356
- actorSystem = TActivationContext::ActorSystem ()](const NThreading::TFuture<IStateStorage::TGetStateResult>& resultFuture) {
420
+ actorSystem = TActivationContext::ActorSystem (),
421
+ context](const NThreading::TFuture<IStateStorage::TGetStateResult>& resultFuture) {
357
422
auto result = resultFuture.GetValue ();
358
423
359
424
auto response = std::make_unique<NYql::NDq::TEvDqCompute::TEvGetTaskStateResult>(checkpointId, result.second , generation);
360
425
std::swap (response->States , result.first );
361
426
if (response->Issues ) {
427
+ context->IncError ();
362
428
LOG_STREAMS_STORAGE_SERVICE_AS_WARN (*actorSystem, " [" << graphId << " ] [" << checkpointId << " ] Failed to get task state: tasks: {" << JoinSeq (" , " , taskIds) << " }, issues: " << response->Issues .ToString ());
363
429
}
364
430
LOG_STREAMS_STORAGE_SERVICE_AS_DEBUG (*actorSystem, " [" << graphId << " ] [" << checkpointId << " ] Send TEvGetTaskStateResult: tasks: {" << JoinSeq (" , " , taskIds) << " }" );
@@ -374,9 +440,10 @@ std::unique_ptr<NActors::IActor> NewStorageProxy(
374
440
const NConfig::TCheckpointCoordinatorConfig& config,
375
441
const NConfig::TCommonConfig& commonConfig,
376
442
const NKikimr::TYdbCredentialsProviderFactory& credentialsProviderFactory,
377
- const TYqSharedResources::TPtr& yqSharedResources)
443
+ const TYqSharedResources::TPtr& yqSharedResources,
444
+ const ::NMonitoring::TDynamicCounterPtr& counters)
378
445
{
379
- return std::unique_ptr<NActors::IActor>(new TStorageProxy (config, commonConfig, credentialsProviderFactory, yqSharedResources));
446
+ return std::unique_ptr<NActors::IActor>(new TStorageProxy (config, commonConfig, credentialsProviderFactory, yqSharedResources, counters ));
380
447
}
381
448
382
449
} // namespace NFq
0 commit comments