Skip to content

feat: [Internal] open telemetry built in metrics for GRPC #3709

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Apr 17, 2025
13 changes: 10 additions & 3 deletions google-cloud-spanner/clirr-ignored-differences.xml
Original file line number Diff line number Diff line change
Expand Up @@ -751,6 +751,13 @@
<method>boolean isEnableBuiltInMetrics()</method>
</difference>

<!-- Added Built In GRPC Metrics option -->
<difference>
<differenceType>7012</differenceType>
<className>com/google/cloud/spanner/SpannerOptions$SpannerEnvironment</className>
<method>boolean isEnableGRPCBuiltInMetrics()</method>
</difference>

<!-- Added Monitoring host option -->
<difference>
<differenceType>7012</differenceType>
Expand Down Expand Up @@ -807,7 +814,7 @@
<className>com/google/cloud/spanner/connection/Connection</className>
<method>boolean isKeepTransactionAlive()</method>
</difference>

<!-- Automatic DML batching -->
<difference>
<differenceType>7012</differenceType>
Expand Down Expand Up @@ -839,7 +846,7 @@
<className>com/google/cloud/spanner/connection/Connection</className>
<method>boolean isAutoBatchDmlUpdateCountVerification()</method>
</difference>

<!-- Retry DML as Partitioned DML -->
<difference>
<differenceType>7012</differenceType>
Expand All @@ -863,7 +870,7 @@
<className>com/google/cloud/spanner/connection/Connection</className>
<method>java.lang.Object runTransaction(com.google.cloud.spanner.connection.Connection$TransactionCallable)</method>
</difference>

<!-- Added experimental host option -->
<difference>
<differenceType>7012</differenceType>
Expand Down
4 changes: 4 additions & 0 deletions google-cloud-spanner/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,10 @@
<groupId>io.grpc</groupId>
<artifactId>grpc-stub</artifactId>
</dependency>
<dependency>
<groupId>io.grpc</groupId>
<artifactId>grpc-opentelemetry</artifactId>
</dependency>
<dependency>
<groupId>com.google.api</groupId>
<artifactId>api-common</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import io.opentelemetry.sdk.metrics.InstrumentSelector;
import io.opentelemetry.sdk.metrics.InstrumentType;
import io.opentelemetry.sdk.metrics.View;
import java.util.Collection;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
Expand All @@ -36,6 +37,7 @@ public class BuiltInMetricsConstant {
public static final String METER_NAME = "spanner.googleapis.com/internal/client";
public static final String GAX_METER_NAME = OpenTelemetryMetricsRecorder.GAX_METER_NAME;
static final String SPANNER_METER_NAME = "spanner-java";
static final String GRPC_METER_NAME = "grpc-java";
static final String GFE_LATENCIES_NAME = "gfe_latencies";
static final String OPERATION_LATENCIES_NAME = "operation_latencies";
static final String ATTEMPT_LATENCIES_NAME = "attempt_latencies";
Expand All @@ -55,6 +57,14 @@ public class BuiltInMetricsConstant {
.map(m -> METER_NAME + '/' + m)
.collect(Collectors.toSet());

static final Collection<String> GRPC_METRICS_TO_ENABLE =
ImmutableList.of(
"grpc.lb.rls.default_target_picks",
"grpc.lb.rls.target_picks",
"grpc.xds_client.server_failure",
"grpc.xds_client.resource_updates_invalid",
"grpc.xds_client.resource_updates_valid");

public static final String SPANNER_RESOURCE_TYPE = "spanner_instance_client";

public static final AttributeKey<String> PROJECT_ID_KEY = AttributeKey.stringKey("project_id");
Expand All @@ -66,12 +76,7 @@ public class BuiltInMetricsConstant {

// These metric labels will be promoted to the spanner monitored resource fields
public static final Set<AttributeKey<String>> SPANNER_PROMOTED_RESOURCE_LABELS =
ImmutableSet.of(
PROJECT_ID_KEY,
INSTANCE_ID_KEY,
INSTANCE_CONFIG_ID_KEY,
LOCATION_ID_KEY,
CLIENT_HASH_KEY);
ImmutableSet.of(INSTANCE_ID_KEY);

public static final AttributeKey<String> DATABASE_KEY = AttributeKey.stringKey("database");
public static final AttributeKey<String> CLIENT_UID_KEY = AttributeKey.stringKey("client_uid");
Expand Down Expand Up @@ -102,6 +107,9 @@ public class BuiltInMetricsConstant {
DIRECT_PATH_ENABLED_KEY,
DIRECT_PATH_USED_KEY);

static final Set<String> GRPC_LB_RLS_ATTRIBUTES =
ImmutableSet.of("grpc.lb.rls.data_plane_target", "grpc.lb.pick_result");

static Aggregation AGGREGATION_WITH_MILLIS_HISTOGRAM =
Aggregation.explicitBucketHistogram(
ImmutableList.of(
Expand All @@ -111,6 +119,14 @@ public class BuiltInMetricsConstant {
10000.0, 20000.0, 50000.0, 100000.0, 200000.0, 400000.0, 800000.0, 1600000.0,
3200000.0));

static final Collection<String> GRPC_METRICS_ENABLED_BY_DEFAULT =
ImmutableList.of(
"grpc.client.attempt.sent_total_compressed_message_size",
"grpc.client.attempt.rcvd_total_compressed_message_size",
"grpc.client.attempt.started",
"grpc.client.attempt.duration",
"grpc.client.call.duration");

static Map<InstrumentSelector, View> getAllViews() {
ImmutableMap.Builder<InstrumentSelector, View> views = ImmutableMap.builder();
defineView(
Expand Down Expand Up @@ -153,6 +169,7 @@ static Map<InstrumentSelector, View> getAllViews() {
Aggregation.sum(),
InstrumentType.COUNTER,
"1");
defineGRPCView(views);
return views.build();
}

Expand Down Expand Up @@ -183,4 +200,26 @@ private static void defineView(
.build();
viewMap.put(selector, view);
}

private static void defineGRPCView(ImmutableMap.Builder<InstrumentSelector, View> viewMap) {
for (String metric : BuiltInMetricsConstant.GRPC_METRICS_TO_ENABLE) {
InstrumentSelector selector =
InstrumentSelector.builder()
.setName(metric)
.setMeterName(BuiltInMetricsConstant.GRPC_METER_NAME)
.build();
Set<String> attributesFilter =
BuiltInMetricsConstant.COMMON_ATTRIBUTES.stream()
.map(AttributeKey::getKey)
.collect(Collectors.toSet());
attributesFilter.addAll(BuiltInMetricsConstant.GRPC_LB_RLS_ATTRIBUTES);

View view =
View.builder()
.setName(BuiltInMetricsConstant.METER_NAME + '/' + metric.replace(".", "/"))
.setAttributeFilter(attributesFilter)
.build();
viewMap.put(selector, view);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,19 +21,28 @@
import static com.google.cloud.spanner.BuiltInMetricsConstant.CLIENT_NAME_KEY;
import static com.google.cloud.spanner.BuiltInMetricsConstant.CLIENT_UID_KEY;
import static com.google.cloud.spanner.BuiltInMetricsConstant.INSTANCE_CONFIG_ID_KEY;
import static com.google.cloud.spanner.BuiltInMetricsConstant.INSTANCE_ID_KEY;
import static com.google.cloud.spanner.BuiltInMetricsConstant.LOCATION_ID_KEY;
import static com.google.cloud.spanner.BuiltInMetricsConstant.PROJECT_ID_KEY;

import com.google.api.core.ApiFunction;
import com.google.api.gax.core.GaxProperties;
import com.google.api.gax.grpc.InstantiatingGrpcChannelProvider;
import com.google.auth.Credentials;
import com.google.cloud.opentelemetry.detection.AttributeKeys;
import com.google.cloud.opentelemetry.detection.DetectedPlatform;
import com.google.cloud.opentelemetry.detection.GCPPlatformDetector;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import io.grpc.ManagedChannelBuilder;
import io.grpc.opentelemetry.GrpcOpenTelemetry;
import io.opentelemetry.api.OpenTelemetry;
import io.opentelemetry.api.common.Attributes;
import io.opentelemetry.api.common.AttributesBuilder;
import io.opentelemetry.sdk.OpenTelemetrySdk;
import io.opentelemetry.sdk.metrics.SdkMeterProvider;
import io.opentelemetry.sdk.metrics.SdkMeterProviderBuilder;
import io.opentelemetry.sdk.resources.Resource;
import java.io.IOException;
import java.lang.management.ManagementFactory;
import java.lang.reflect.Method;
Expand Down Expand Up @@ -66,6 +75,7 @@ OpenTelemetry getOrCreateOpenTelemetry(
BuiltInMetricsView.registerBuiltinMetrics(
SpannerCloudMonitoringExporter.create(projectId, credentials, monitoringHost),
sdkMeterProviderBuilder);
sdkMeterProviderBuilder.setResource(Resource.create(createResourceAttributes(projectId)));
SdkMeterProvider sdkMeterProvider = sdkMeterProviderBuilder.build();
this.openTelemetry = OpenTelemetrySdk.builder().setMeterProvider(sdkMeterProvider).build();
Runtime.getRuntime().addShutdownHook(new Thread(sdkMeterProvider::close));
Expand All @@ -80,15 +90,47 @@ OpenTelemetry getOrCreateOpenTelemetry(
}
}

Map<String, String> createClientAttributes(String projectId, String client_name) {
void enableGrpcMetrics(
InstantiatingGrpcChannelProvider.Builder channelProviderBuilder,
String projectId,
@Nullable Credentials credentials,
@Nullable String monitoringHost) {
GrpcOpenTelemetry grpcOpenTelemetry =
GrpcOpenTelemetry.newBuilder()
.sdk(this.getOrCreateOpenTelemetry(projectId, credentials, monitoringHost))
.enableMetrics(BuiltInMetricsConstant.GRPC_METRICS_TO_ENABLE)
// Disable gRPCs default metrics as they are not needed for Spanner.
.disableMetrics(BuiltInMetricsConstant.GRPC_METRICS_ENABLED_BY_DEFAULT)
.build();
ApiFunction<ManagedChannelBuilder, ManagedChannelBuilder> channelConfigurator =
channelProviderBuilder.getChannelConfigurator();
channelProviderBuilder.setChannelConfigurator(
b -> {
grpcOpenTelemetry.configureChannelBuilder(b);
if (channelConfigurator != null) {
return channelConfigurator.apply(b);
}
return b;
});
}

Attributes createResourceAttributes(String projectId) {
AttributesBuilder attributesBuilder =
Attributes.builder()
.put(PROJECT_ID_KEY.getKey(), projectId)
.put(INSTANCE_CONFIG_ID_KEY.getKey(), "unknown")
.put(CLIENT_HASH_KEY.getKey(), generateClientHash(getDefaultTaskValue()))
.put(INSTANCE_ID_KEY.getKey(), "unknown")
.put(LOCATION_ID_KEY.getKey(), detectClientLocation());

return attributesBuilder.build();
}

Map<String, String> createClientAttributes() {
Map<String, String> clientAttributes = new HashMap<>();
clientAttributes.put(LOCATION_ID_KEY.getKey(), detectClientLocation());
clientAttributes.put(PROJECT_ID_KEY.getKey(), projectId);
clientAttributes.put(INSTANCE_CONFIG_ID_KEY.getKey(), "unknown");
clientAttributes.put(CLIENT_NAME_KEY.getKey(), client_name);
String clientUid = getDefaultTaskValue();
clientAttributes.put(CLIENT_UID_KEY.getKey(), clientUid);
clientAttributes.put(CLIENT_HASH_KEY.getKey(), generateClientHash(clientUid));
clientAttributes.put(
CLIENT_NAME_KEY.getKey(), "spanner-java/" + GaxProperties.getLibraryVersion(getClass()));
clientAttributes.put(CLIENT_UID_KEY.getKey(), getDefaultTaskValue());
return clientAttributes;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@

package com.google.cloud.spanner;

import static com.google.cloud.spanner.BuiltInMetricsConstant.SPANNER_METRICS;

import com.google.api.core.ApiFuture;
import com.google.api.core.ApiFutureCallback;
import com.google.api.core.ApiFutures;
Expand All @@ -39,8 +37,8 @@
import io.opentelemetry.sdk.metrics.InstrumentType;
import io.opentelemetry.sdk.metrics.data.AggregationTemporality;
import io.opentelemetry.sdk.metrics.data.MetricData;
import io.opentelemetry.sdk.metrics.data.PointData;
import io.opentelemetry.sdk.metrics.export.MetricExporter;
import io.opentelemetry.sdk.resources.Resource;
import java.io.IOException;
import java.time.Duration;
import java.util.ArrayList;
Expand Down Expand Up @@ -114,27 +112,19 @@ public CompletableResultCode export(@Nonnull Collection<MetricData> collection)

/** Export client built in metrics */
private CompletableResultCode exportSpannerClientMetrics(Collection<MetricData> collection) {
// Filter spanner metrics. Only include metrics that contain a project and instance ID.
List<MetricData> spannerMetricData =
collection.stream()
.filter(md -> SPANNER_METRICS.contains(md.getName()))
.collect(Collectors.toList());
// Filter spanner metrics. Only include metrics that contain a valid project.
List<MetricData> spannerMetricData = collection.stream().collect(Collectors.toList());

// Log warnings for metrics that will be skipped.
boolean mustFilter = false;
if (spannerMetricData.stream()
.flatMap(metricData -> metricData.getData().getPoints().stream())
.map(metricData -> metricData.getResource())
.anyMatch(this::shouldSkipPointDataDueToProjectId)) {
logger.log(
Level.WARNING, "Some metric data contain a different projectId. These will be skipped.");
mustFilter = true;
}
if (spannerMetricData.stream()
.flatMap(metricData -> metricData.getData().getPoints().stream())
.anyMatch(this::shouldSkipPointDataDueToMissingInstanceId)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Curious - why are we skipping this. Did we fix the issue that a instance id could be missing (because we will set "unknown" by default?).

My concern is that if there is a bug that causes the instance id not to be correctly populated, we won't be able to catch that regression.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Earlier instance_id was only getting set from HeaderInterceptor, so there were cases when instance_id was null in cases when we dont receive a response header.
Now since we have added a default "unknown", we don't need this check.

We can catch regression by ITBuiltInMetricsTest which checks for data as per the filtered query and one of the filter is "instance_id"

logger.log(Level.WARNING, "Some metric data miss instanceId. These will be skipped.");
mustFilter = true;
}

if (mustFilter) {
spannerMetricData =
spannerMetricData.stream()
Expand Down Expand Up @@ -198,19 +188,11 @@ public void onSuccess(List<Empty> empty) {
}

private boolean shouldSkipMetricData(MetricData metricData) {
return metricData.getData().getPoints().stream()
.anyMatch(
pd ->
shouldSkipPointDataDueToProjectId(pd)
|| shouldSkipPointDataDueToMissingInstanceId(pd));
}

private boolean shouldSkipPointDataDueToProjectId(PointData pointData) {
return !spannerProjectId.equals(SpannerCloudMonitoringExporterUtils.getProjectId(pointData));
return shouldSkipPointDataDueToProjectId(metricData.getResource());
}

private boolean shouldSkipPointDataDueToMissingInstanceId(PointData pointData) {
return SpannerCloudMonitoringExporterUtils.getInstanceId(pointData) == null;
private boolean shouldSkipPointDataDueToProjectId(Resource resource) {
return !spannerProjectId.equals(SpannerCloudMonitoringExporterUtils.getProjectId(resource));
}

boolean lastExportSkippedData() {
Expand Down
Loading
Loading