Skip to content

Commit 84bdeb3

Browse files
authored
[sdk-metrics] Fix race condition for MetricPoint Reclaim (#5546)
1 parent b8729a0 commit 84bdeb3

File tree

3 files changed

+37
-1
lines changed

3 files changed

+37
-1
lines changed

src/OpenTelemetry/CHANGELOG.md

+5
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@
77
function when configuring a view (applies to individual metrics).
88
([#5542](https://github.com/open-telemetry/opentelemetry-dotnet/pull/5542))
99

10+
* Fixed a race condition for the experimental MetricPoint reclaim scenario
11+
(enabled via `OTEL_DOTNET_EXPERIMENTAL_METRICS_RECLAIM_UNUSED_METRIC_POINTS`)
12+
which could have led to a measurement being dropped.
13+
([#5546](https://github.com/open-telemetry/opentelemetry-dotnet/pull/5546))
14+
1015
## 1.8.1
1116

1217
Released 2024-Apr-17

src/OpenTelemetry/Metrics/AggregatorStore.cs

+30-1
Original file line numberDiff line numberDiff line change
@@ -265,12 +265,41 @@ internal void SnapshotDeltaWithMetricPointReclaim()
265265

266266
if (metricPoint.MetricPointStatus == MetricPointStatus.NoCollectPending)
267267
{
268+
// Reclaim the MetricPoint if it was marked for it in the previous collect cycle
269+
if (metricPoint.LookupData != null && metricPoint.LookupData.DeferredReclaim == true)
270+
{
271+
this.ReclaimMetricPoint(ref metricPoint, i);
272+
continue;
273+
}
274+
275+
// Check if the MetricPoint could be reclaimed in the current Collect cycle.
268276
// If metricPoint.LookupData is `null` then the MetricPoint is already reclaimed and in the queue.
269277
// If the Collect thread is successfully able to compare and swap the reference count from zero to int.MinValue, it means that
270278
// the MetricPoint can be reused for other tags.
271279
if (metricPoint.LookupData != null && Interlocked.CompareExchange(ref metricPoint.ReferenceCount, int.MinValue, 0) == 0)
272280
{
273-
this.ReclaimMetricPoint(ref metricPoint, i);
281+
// This is similar to double-checked locking. In rare cases, the Collect thread might read the status as `NoCollectPending`,
282+
// and then get switched out before it could set the ReferenceCount to `int.MinValue`. In the meantime, an Update thread could come in
283+
// and update the MetricPoint, thereby, setting its status to `CollectPending`. Note that the ReferenceCount would be 0 after the update.
284+
// If the Collect thread now wakes up, it would be able to set the ReferenceCount to `int.MinValue`, thereby, marking the MetricPoint
285+
// invalid for newer updates. In such cases, the MetricPoint should not be reclaimed before taking its Snapshot.
286+
287+
if (metricPoint.MetricPointStatus == MetricPointStatus.NoCollectPending)
288+
{
289+
this.ReclaimMetricPoint(ref metricPoint, i);
290+
}
291+
else
292+
{
293+
// MetricPoint's ReferenceCount is `int.MinValue` but it still has a collect pending. Take the MetricPoint's Snapshot
294+
// and mark it to be reclaimed in the next Collect cycle.
295+
296+
metricPoint.LookupData.DeferredReclaim = true;
297+
298+
this.TakeMetricPointSnapshot(ref metricPoint, outputDelta: true);
299+
300+
this.currentMetricPointBatch[this.batchSize] = i;
301+
this.batchSize++;
302+
}
274303
}
275304

276305
continue;

src/OpenTelemetry/Metrics/LookupData.cs

+2
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,14 @@ namespace OpenTelemetry.Metrics;
55

66
internal sealed class LookupData
77
{
8+
public bool DeferredReclaim;
89
public int Index;
910
public Tags SortedTags;
1011
public Tags GivenTags;
1112

1213
public LookupData(int index, in Tags sortedTags, in Tags givenTags)
1314
{
15+
this.DeferredReclaim = false;
1416
this.Index = index;
1517
this.SortedTags = sortedTags;
1618
this.GivenTags = givenTags;

0 commit comments

Comments
 (0)