Skip to content

Commit cf645f3

Browse files
authored
Fix hash value for Prometheus histogram metrics. (#175)
Through internal investigation, I observed that the length of one of the queues was consistently high and blocked the whole queue manager. I identified that a large volume of points with hash value 0 were being enqueued onto shard #0, which made the sharding imbalanced and left the sharded queues constantly blocked by shard #0. When the Stackdriver Prometheus Sidecar transforms Prometheus histogram metrics into Stackdriver time series with the distribution value type, it should return the hash value derived by the series cache rather than 0. This change fixes the undesired behavior.
1 parent cc5ee91 commit cf645f3

File tree

2 files changed

+22
-13
lines changed

2 files changed

+22
-13
lines changed

retrieval/transform.go

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -38,21 +38,22 @@ type sampleBuilder struct {
3838
// the remainder of the input.
3939
func (b *sampleBuilder) next(ctx context.Context, samples []tsdb.RefSample) (*monitoring_pb.TimeSeries, uint64, []tsdb.RefSample, error) {
4040
sample := samples[0]
41+
tailSamples := samples[1:]
4142

4243
entry, ok, err := b.series.get(ctx, sample.Ref)
4344
if err != nil {
4445
return nil, 0, samples, errors.Wrap(err, "get series information")
4546
}
4647
if !ok {
47-
return nil, 0, samples[1:], nil
48+
return nil, 0, tailSamples, nil
4849
}
4950

5051
if entry.tracker != nil {
5152
entry.tracker.newPoint(ctx, entry.lset, sample.T, sample.V)
5253
}
5354

5455
if !entry.exported {
55-
return nil, 0, samples[1:], nil
56+
return nil, 0, tailSamples, nil
5657
}
5758
// Get a shallow copy of the proto so we can overwrite the point field
5859
// and safely send it into the remote queues.
@@ -72,7 +73,7 @@ func (b *sampleBuilder) next(ctx context.Context, samples []tsdb.RefSample) (*mo
7273
var v float64
7374
resetTimestamp, v, ok = b.series.getResetAdjusted(sample.Ref, sample.T, sample.V)
7475
if !ok {
75-
return nil, 0, samples[1:], nil
76+
return nil, 0, tailSamples, nil
7677
}
7778
point.Interval.StartTime = getTimestamp(resetTimestamp)
7879
point.Value = &monitoring_pb.TypedValue{Value: &monitoring_pb.TypedValue_DoubleValue{v}}
@@ -86,46 +87,45 @@ func (b *sampleBuilder) next(ctx context.Context, samples []tsdb.RefSample) (*mo
8687
var v float64
8788
resetTimestamp, v, ok = b.series.getResetAdjusted(sample.Ref, sample.T, sample.V)
8889
if !ok {
89-
return nil, 0, samples[1:], nil
90+
return nil, 0, tailSamples, nil
9091
}
9192
point.Interval.StartTime = getTimestamp(resetTimestamp)
9293
point.Value = &monitoring_pb.TypedValue{Value: &monitoring_pb.TypedValue_DoubleValue{v}}
9394
case metricSuffixCount:
9495
var v float64
9596
resetTimestamp, v, ok = b.series.getResetAdjusted(sample.Ref, sample.T, sample.V)
9697
if !ok {
97-
return nil, 0, samples[1:], nil
98+
return nil, 0, tailSamples, nil
9899
}
99100
point.Interval.StartTime = getTimestamp(resetTimestamp)
100101
point.Value = &monitoring_pb.TypedValue{Value: &monitoring_pb.TypedValue_Int64Value{int64(v)}}
101102
case "": // Actual quantiles.
102103
point.Value = &monitoring_pb.TypedValue{Value: &monitoring_pb.TypedValue_DoubleValue{sample.V}}
103104
default:
104-
return nil, 0, samples[1:], errors.Errorf("unexpected metric name suffix %q", entry.suffix)
105+
return nil, 0, tailSamples, errors.Errorf("unexpected metric name suffix %q", entry.suffix)
105106
}
106107

107108
case textparse.MetricTypeHistogram:
108109
// We pass in the original lset for matching since Prometheus's target label must
109110
// be the same as well.
110111
var v *distribution_pb.Distribution
111-
v, resetTimestamp, samples, err = b.buildDistribution(ctx, entry.metadata.Metric, entry.lset, samples)
112+
v, resetTimestamp, tailSamples, err = b.buildDistribution(ctx, entry.metadata.Metric, entry.lset, samples)
112113
if v == nil || err != nil {
113-
return nil, 0, samples, err
114+
return nil, 0, tailSamples, err
114115
}
115116
point.Interval.StartTime = getTimestamp(resetTimestamp)
116117
point.Value = &monitoring_pb.TypedValue{
117118
Value: &monitoring_pb.TypedValue_DistributionValue{v},
118119
}
119-
return &ts, 0, samples, nil
120120

121121
default:
122122
return nil, 0, samples[1:], errors.Errorf("unexpected metric type %s", entry.metadata.Type)
123123
}
124124

125125
if !b.series.updateSampleInterval(entry.hash, resetTimestamp, sample.T) {
126-
return nil, 0, samples[1:], nil
126+
return nil, 0, tailSamples, nil
127127
}
128-
return &ts, entry.hash, samples[1:], nil
128+
return &ts, entry.hash, tailSamples, nil
129129
}
130130

131131
const (

retrieval/transform_test.go

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -768,8 +768,10 @@ func TestSampleBuilder(t *testing.T) {
768768
t.Logf("Test case %d", i)
769769

770770
var s *monitoring_pb.TimeSeries
771+
var h uint64
771772
var err error
772773
var result []*monitoring_pb.TimeSeries
774+
var hashes []uint64
773775

774776
aggr, _ := NewCounterAggregator(log.NewNopLogger(), new(CounterAggregatorConfig))
775777
series := newSeriesCache(nil, "", nil, nil, c.targets, c.metadata, resourceMaps, c.metricPrefix, false, aggr)
@@ -780,11 +782,12 @@ func TestSampleBuilder(t *testing.T) {
780782
b := &sampleBuilder{series: series}
781783

782784
for k := 0; len(c.input) > 0; k++ {
783-
s, _, c.input, err = b.next(context.Background(), c.input)
785+
s, h, c.input, err = b.next(context.Background(), c.input)
784786
if err != nil {
785787
break
786788
}
787789
result = append(result, s)
790+
hashes = append(hashes, h)
788791
}
789792
if err == nil && c.fail {
790793
t.Fatal("expected error but got none")
@@ -801,7 +804,13 @@ func TestSampleBuilder(t *testing.T) {
801804
t.Logf("expres %v", c.result)
802805
t.Fatalf("unexpected sample %d: got\n\t%v\nwant\n\t%v", k, res, c.result[k])
803806
}
807+
expectedHash := uint64(0)
808+
if c.result[k] != nil {
809+
expectedHash = hashSeries(c.result[k])
810+
}
811+
if hashes[k] != expectedHash {
812+
t.Fatalf("unexpected hash %v; want %v", hashes[k], expectedHash)
813+
}
804814
}
805-
806815
}
807816
}

0 commit comments

Comments
 (0)