Skip to content

Commit e528785

Browse files
authored
refactor(tracking): replace flat operationType with dimensional attributes (#623)
1 parent 7df4703 commit e528785

5 files changed

Lines changed: 174 additions & 116 deletions

File tree

Lines changed: 24 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,43 @@
11
package com.linkedin.metadata.dao.tracking;
22

33
import javax.annotation.Nonnull;
4+
import javax.annotation.Nullable;
45

56

67
/**
7-
* Interface for recording per-DAO-operation latency and error metrics.
8+
* Interface for recording per-DAO-operation latency and count metrics with dimensional attributes.
89
*
9-
* <p>Implementations collect histograms (latency) and counters (operation count, error count)
10-
* to benchmark DAO performance during the MySQL to TiDB migration evaluation.</p>
10+
* <p>Each call records both a count (one increment) and a latency observation. Implementations
11+
* are expected to emit one metric per dimension instead of baking dimensions into metric names,
12+
* so consumers can aggregate or filter on individual attributes without enumerating every
13+
* concrete metric series.
1114
*
12-
* <p>Follows the same pattern as {@link BaseTrackingManager} / {@link DummyTrackingManager}:
13-
* a no-op implementation ({@code NoOpDaoBenchmarkMetrics}) lives in the kernel (datahub-gma)
14-
* and the real Dropwizard-backed implementation lives in the service layer.</p>
15+
* <p>A no-op implementation ({@link NoOpDaoBenchmarkMetrics}) lives in the kernel; concrete
16+
* backends (OTEL, Dropwizard, etc.) live in the service layer.
1517
*/
1618
public interface BaseDaoBenchmarkMetrics {
1719

1820
/**
19-
* Record the latency of a successful or failed DAO operation.
21+
* Record a completed DAO operation with its dimensional attributes.
2022
*
21-
* @param operationType the DAO operation name (e.g. "add", "batchGetUnion", "list")
22-
* @param entityType the entity type derived from the URN class (e.g. "dataset", "corpuser")
23-
* @param latencyMs wall-clock latency in milliseconds
23+
* @param operation pure operation name with no concatenation (e.g. {@code "add"},
24+
* {@code "batchGetUnion"})
25+
* @param entityType entity type derived from the URN class (e.g. {@code "dataset"})
26+
* @param aspect aspect class simple name for per-aspect operations, or {@code null} when
27+
* the operation is not per-aspect
28+
* @param countBucket pre-bucketed count label ({@code "1"} through {@code "9"}, {@code "10+"})
29+
* for batch operations, or {@code null} when there is no count dimension
30+
* @param status outcome string, e.g. {@code "success"} or {@code "failure"}
31+
* @param errorClass simple name of the thrown exception on failure, or {@code null} on success
32+
* @param latencyMs wall-clock latency of the operation in milliseconds
2433
*/
25-
void recordOperationLatency(@Nonnull String operationType, @Nonnull String entityType, long latencyMs);
34+
void recordOperation(@Nonnull String operation, @Nonnull String entityType,
35+
@Nullable String aspect, @Nullable String countBucket, @Nonnull String status,
36+
@Nullable String errorClass, long latencyMs);
2637

2738
/**
28-
* Record an error that occurred during a DAO operation.
29-
*
30-
* @param operationType the DAO operation name (e.g. "add", "create")
31-
* @param entityType the entity type derived from the URN class
32-
* @param exceptionClass the simple class name of the thrown exception (e.g. "SQLException")
33-
*/
34-
void recordOperationError(@Nonnull String operationType, @Nonnull String entityType,
35-
@Nonnull String exceptionClass);
36-
37-
/**
38-
* Whether metrics collection is enabled. Callers may short-circuit expensive
39-
* instrumentation when this returns {@code false}.
40-
*
41-
* @return true if metrics are being collected
39+
* Whether metrics collection is enabled. Callers may short-circuit instrumentation when
40+
* this returns {@code false}.
4241
*/
4342
boolean isEnabled();
4443
}

dao-api/src/main/java/com/linkedin/metadata/dao/tracking/NoOpDaoBenchmarkMetrics.java

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,20 @@
11
package com.linkedin.metadata.dao.tracking;
22

33
import javax.annotation.Nonnull;
4+
import javax.annotation.Nullable;
45

56

67
/**
7-
* A no-op implementation of {@link BaseDaoBenchmarkMetrics} that discards all metrics.
8+
* A no-op implementation of {@link BaseDaoBenchmarkMetrics} that discards all observations.
89
*
910
* <p>Used as the default when no real metrics backend is configured.
10-
* Follows the same pattern as {@link DummyTrackingManager}.</p>
1111
*/
1212
public class NoOpDaoBenchmarkMetrics implements BaseDaoBenchmarkMetrics {
1313

1414
@Override
15-
public void recordOperationLatency(@Nonnull String operationType, @Nonnull String entityType, long latencyMs) {
16-
// Do nothing
17-
}
18-
19-
@Override
20-
public void recordOperationError(@Nonnull String operationType, @Nonnull String entityType,
21-
@Nonnull String exceptionClass) {
15+
public void recordOperation(@Nonnull String operation, @Nonnull String entityType,
16+
@Nullable String aspect, @Nullable String countBucket, @Nonnull String status,
17+
@Nullable String errorClass, long latencyMs) {
2218
// Do nothing
2319
}
2420

dao-api/src/test/java/com/linkedin/metadata/dao/tracking/NoOpDaoBenchmarkMetricsTest.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@ public class NoOpDaoBenchmarkMetricsTest {
1111
public void testNoOpBehavior() {
1212
NoOpDaoBenchmarkMetrics metrics = new NoOpDaoBenchmarkMetrics();
1313

14-
// Should not throw
15-
metrics.recordOperationLatency("add", "dataset", 42L);
16-
metrics.recordOperationError("add", "dataset", "SQLException");
14+
// Should not throw on any dimension combination
15+
metrics.recordOperation("add", "dataset", "AspectFoo", null, "success", null, 42L);
16+
metrics.recordOperation("batchUpsert", "corpuser", null, "3", "failure", "SQLException", 15L);
1717

1818
assertFalse(metrics.isEnabled());
1919
}

dao-impl/ebean-dao/src/main/java/com/linkedin/metadata/dao/InstrumentedEbeanLocalAccess.java

Lines changed: 74 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
package com.linkedin.metadata.dao;
22

3+
import com.google.common.annotations.VisibleForTesting;
34
import com.linkedin.common.AuditStamp;
45
import com.linkedin.common.urn.Urn;
56
import com.linkedin.data.template.RecordTemplate;
@@ -19,30 +20,43 @@
1920

2021

2122
/**
22-
* A decorator around {@link IEbeanLocalAccess} that records per-operation latency and error metrics
23-
* via {@link BaseDaoBenchmarkMetrics}.
23+
* A decorator around {@link IEbeanLocalAccess} that records per-operation latency and count
24+
* metrics via {@link BaseDaoBenchmarkMetrics}, passing dimensions (operation, aspect, count
25+
* bucket, status, error class) as separate attributes rather than baking them into a single
26+
* metric name.
2427
*
2528
* <p>Delegates every call to the wrapped implementation, measuring wall-clock time with
26-
* {@link System#nanoTime()}. On success, records latency; on error, records both latency and
27-
* the exception class, then re-throws.</p>
29+
* {@link System#nanoTime()}. On both success and failure, calls
30+
* {@link BaseDaoBenchmarkMetrics#recordOperation} once with the observed latency, status, and
31+
* (on failure) the thrown exception's simple class name; the exception is re-thrown unchanged.
2832
*
2933
* <p>When {@link BaseDaoBenchmarkMetrics#isEnabled()} returns {@code false}, delegation is
30-
* direct with zero overhead (no timing).</p>
34+
* direct with zero overhead (no timing).
3135
*
3236
* @param <URN> the URN type for this entity
3337
*/
3438
public class InstrumentedEbeanLocalAccess<URN extends Urn> implements IEbeanLocalAccess<URN> {
3539

40+
static final String STATUS_SUCCESS = "success";
41+
static final String STATUS_FAILURE = "failure";
42+
43+
// 1..9 mapped to interned strings; 0 and 10+ handled separately. Avoids
44+
// Integer.toString() allocation on the hot path.
45+
private static final String[] COUNT_BUCKET_INTERNED =
46+
{"1", "2", "3", "4", "5", "6", "7", "8", "9"};
47+
private static final String BUCKET_ZERO = "0";
48+
private static final String BUCKET_OVERFLOW = "10+";
49+
3650
private final IEbeanLocalAccess<URN> _delegate;
3751
private final BaseDaoBenchmarkMetrics _metrics;
3852
private final String _entityType;
3953

4054
/**
4155
* Creates an instrumented wrapper around the given local access implementation.
4256
*
43-
* @param delegate the real local access implementation to wrap
44-
* @param metrics the metrics recorder (may be a no-op)
45-
* @param urnClass the URN class, used to derive the entity type name once at construction
57+
* @param delegate the real local access implementation to wrap
58+
* @param metrics the metrics recorder (may be a no-op)
59+
* @param urnClass the URN class, used to derive the entity type name once at construction
4660
*/
4761
public InstrumentedEbeanLocalAccess(@Nonnull IEbeanLocalAccess<URN> delegate,
4862
@Nonnull BaseDaoBenchmarkMetrics metrics, @Nonnull Class<URN> urnClass) {
@@ -51,6 +65,23 @@ public InstrumentedEbeanLocalAccess(@Nonnull IEbeanLocalAccess<URN> delegate,
5165
_entityType = urnClass.getSimpleName().replace("Urn", "").toLowerCase();
5266
}
5367

68+
/**
69+
* Bucket a count value into a small fixed set of labels to keep metric cardinality bounded.
70+
*
71+
* <p>Returns {@code "0"} for non-positive values (defensive; batch ops shouldn't hit this),
72+
* the interned digit string {@code "1".."9"} for 1-9, or {@code "10+"} for anything >= 10.
73+
*/
74+
@VisibleForTesting
75+
static String bucketCount(int n) {
76+
if (n <= 0) {
77+
return BUCKET_ZERO;
78+
}
79+
if (n >= 10) {
80+
return BUCKET_OVERFLOW;
81+
}
82+
return COUNT_BUCKET_INTERNED[n - 1];
83+
}
84+
5485
@Override
5586
public void setUrnPathExtractor(@Nonnull UrnPathExtractor<URN> urnPathExtractor) {
5687
_delegate.setUrnPathExtractor(urnPathExtractor);
@@ -66,16 +97,17 @@ public void configureOptionalForceIndex(@Nullable String indexName,
6697
public <ASPECT extends RecordTemplate> int add(@Nonnull URN urn, @Nullable ASPECT newValue,
6798
@Nonnull Class<ASPECT> aspectClass, @Nonnull AuditStamp auditStamp,
6899
@Nullable IngestionTrackingContext ingestionTrackingContext, boolean isTestMode) {
69-
return instrument("add." + aspectClass.getSimpleName(), () -> _delegate.add(urn, newValue,
70-
aspectClass, auditStamp, ingestionTrackingContext, isTestMode));
100+
return instrument("add", aspectClass.getSimpleName(), null,
101+
() -> _delegate.add(urn, newValue, aspectClass, auditStamp,
102+
ingestionTrackingContext, isTestMode));
71103
}
72104

73105
@Override
74106
public <ASPECT extends RecordTemplate> int addWithOptimisticLocking(@Nonnull URN urn,
75107
@Nullable ASPECT newValue, @Nonnull Class<ASPECT> aspectClass, @Nonnull AuditStamp auditStamp,
76108
@Nullable Timestamp oldTimestamp, @Nullable IngestionTrackingContext ingestionTrackingContext,
77109
boolean isTestMode, boolean softDeleteOverwrite) {
78-
return instrument("addWithOptimisticLocking." + aspectClass.getSimpleName(),
110+
return instrument("addWithOptimisticLocking", aspectClass.getSimpleName(), null,
79111
() -> _delegate.addWithOptimisticLocking(urn, newValue, aspectClass, auditStamp,
80112
oldTimestamp, ingestionTrackingContext, isTestMode, softDeleteOverwrite));
81113
}
@@ -86,7 +118,7 @@ public <ASPECT_UNION extends RecordTemplate> int create(@Nonnull URN urn,
86118
@Nonnull List<BaseLocalDAO.AspectCreateLambda<? extends RecordTemplate>> aspectCreateLambdas,
87119
@Nonnull AuditStamp auditStamp, @Nullable IngestionTrackingContext ingestionTrackingContext,
88120
boolean isTestMode) {
89-
return instrument("create.aspects_" + aspectValues.size(),
121+
return instrument("create", null, bucketCount(aspectValues.size()),
90122
() -> _delegate.create(urn, aspectValues, aspectCreateLambdas,
91123
auditStamp, ingestionTrackingContext, isTestMode));
92124
}
@@ -96,82 +128,87 @@ public <ASPECT_UNION extends RecordTemplate> int batchUpsert(@Nonnull URN urn,
96128
@Nonnull List<BaseLocalDAO.AspectUpdateContext<RecordTemplate>> updateContexts,
97129
@Nonnull AuditStamp auditStamp, @Nullable IngestionTrackingContext ingestionTrackingContext,
98130
boolean isTestMode) {
99-
return instrument("batchUpsert.aspects_" + updateContexts.size(),
100-
() -> _delegate.batchUpsert(urn, updateContexts, auditStamp, ingestionTrackingContext, isTestMode));
131+
return instrument("batchUpsert", null, bucketCount(updateContexts.size()),
132+
() -> _delegate.batchUpsert(urn, updateContexts, auditStamp,
133+
ingestionTrackingContext, isTestMode));
101134
}
102135

103136
@Nonnull
104137
@Override
105138
public <ASPECT extends RecordTemplate> List<EbeanMetadataAspect> batchGetUnion(
106139
@Nonnull List<AspectKey<URN, ? extends RecordTemplate>> keys, int keysCount, int position,
107140
boolean includeSoftDeleted, boolean isTestMode) {
108-
return instrument("batchGetUnion.keys_" + keys.size(),
141+
return instrument("batchGetUnion", null, bucketCount(keys.size()),
109142
() -> _delegate.batchGetUnion(keys, keysCount, position, includeSoftDeleted, isTestMode));
110143
}
111144

112145
@Override
113146
public int softDeleteAsset(@Nonnull URN urn, boolean isTestMode) {
114-
return instrument("softDeleteAsset", () -> _delegate.softDeleteAsset(urn, isTestMode));
147+
return instrument("softDeleteAsset", null, null,
148+
() -> _delegate.softDeleteAsset(urn, isTestMode));
115149
}
116150

117151
@Override
118-
public Map<URN, EntityDeletionInfo> readDeletionInfoBatch(@Nonnull List<URN> urns, boolean isTestMode) {
119-
return instrument("readDeletionInfoBatch.urns_" + urns.size(),
152+
public Map<URN, EntityDeletionInfo> readDeletionInfoBatch(@Nonnull List<URN> urns,
153+
boolean isTestMode) {
154+
return instrument("readDeletionInfoBatch", null, bucketCount(urns.size()),
120155
() -> _delegate.readDeletionInfoBatch(urns, isTestMode));
121156
}
122157

123158
@Override
124-
public int batchSoftDeleteAssets(@Nonnull List<URN> urns, @Nonnull String cutoffTimestamp, boolean isTestMode) {
125-
return instrument("batchSoftDeleteAssets.urns_" + urns.size(),
159+
public int batchSoftDeleteAssets(@Nonnull List<URN> urns, @Nonnull String cutoffTimestamp,
160+
boolean isTestMode) {
161+
return instrument("batchSoftDeleteAssets", null, bucketCount(urns.size()),
126162
() -> _delegate.batchSoftDeleteAssets(urns, cutoffTimestamp, isTestMode));
127163
}
128164

129165
@Override
130166
public List<URN> listUrns(@Nullable IndexFilter indexFilter,
131167
@Nullable IndexSortCriterion indexSortCriterion, @Nullable URN lastUrn, int pageSize) {
132-
return instrument("listUrns.cursor",
168+
return instrument("listUrns.cursor", null, null,
133169
() -> _delegate.listUrns(indexFilter, indexSortCriterion, lastUrn, pageSize));
134170
}
135171

136172
@Override
137173
public ListResult<URN> listUrns(@Nullable IndexFilter indexFilter,
138174
@Nullable IndexSortCriterion indexSortCriterion, int start, int pageSize) {
139-
return instrument("listUrns.offset",
175+
return instrument("listUrns.offset", null, null,
140176
() -> _delegate.listUrns(indexFilter, indexSortCriterion, start, pageSize));
141177
}
142178

143179
@Override
144180
public boolean exists(@Nonnull URN urn) {
145-
return instrument("exists", () -> _delegate.exists(urn));
181+
return instrument("exists", null, null, () -> _delegate.exists(urn));
146182
}
147183

148184
@Nonnull
149185
@Override
150186
public Map<String, Long> countAggregate(@Nullable IndexFilter indexFilter,
151187
@Nonnull IndexGroupByCriterion indexGroupByCriterion) {
152-
return instrument("countAggregate",
188+
return instrument("countAggregate", null, null,
153189
() -> _delegate.countAggregate(indexFilter, indexGroupByCriterion));
154190
}
155191

156192
@Nonnull
157193
@Override
158194
public <ASPECT extends RecordTemplate> ListResult<URN> listUrns(@Nonnull Class<ASPECT> aspectClass,
159195
int start, int pageSize) {
160-
return instrument("listUrns", () -> _delegate.listUrns(aspectClass, start, pageSize));
196+
return instrument("listUrns", null, null,
197+
() -> _delegate.listUrns(aspectClass, start, pageSize));
161198
}
162199

163200
@Nonnull
164201
@Override
165202
public <ASPECT extends RecordTemplate> ListResult<ASPECT> list(@Nonnull Class<ASPECT> aspectClass,
166203
@Nonnull URN urn, int start, int pageSize) {
167-
return instrument("list", () -> _delegate.list(aspectClass, urn, start, pageSize));
204+
return instrument("list", null, null, () -> _delegate.list(aspectClass, urn, start, pageSize));
168205
}
169206

170207
@Nonnull
171208
@Override
172209
public <ASPECT extends RecordTemplate> ListResult<ASPECT> list(@Nonnull Class<ASPECT> aspectClass,
173210
int start, int pageSize) {
174-
return instrument("list", () -> _delegate.list(aspectClass, start, pageSize));
211+
return instrument("list", null, null, () -> _delegate.list(aspectClass, start, pageSize));
175212
}
176213

177214
@Override
@@ -181,21 +218,26 @@ public void ensureSchemaUpToDate() {
181218
}
182219

183220
/**
184-
* Core instrumentation wrapper. When metrics are disabled, delegates directly.
185-
* When enabled, times the operation and records latency on success, latency + error on failure.
221+
* Core instrumentation wrapper. When metrics are disabled, delegates directly. When enabled,
222+
* times the operation and emits one {@code recordOperation} call in the {@code finally} block
223+
* with status and (on failure) error class populated.
186224
*/
187-
private <T> T instrument(@Nonnull String operationType, @Nonnull Supplier<T> supplier) {
225+
private <T> T instrument(@Nonnull String operation, @Nullable String aspect,
226+
@Nullable String countBucket, @Nonnull Supplier<T> supplier) {
188227
if (!_metrics.isEnabled()) {
189228
return supplier.get();
190229
}
191230
final long startNanos = System.nanoTime();
231+
String status = STATUS_SUCCESS;
232+
String errorClass = null;
192233
try {
193234
return supplier.get();
194235
} catch (RuntimeException ex) {
195-
_metrics.recordOperationError(operationType, _entityType, ex.getClass().getSimpleName());
236+
status = STATUS_FAILURE;
237+
errorClass = ex.getClass().getSimpleName();
196238
throw ex;
197239
} finally {
198-
_metrics.recordOperationLatency(operationType, _entityType,
240+
_metrics.recordOperation(operation, _entityType, aspect, countBucket, status, errorClass,
199241
TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNanos));
200242
}
201243
}

0 commit comments

Comments
 (0)