@@ -29,6 +29,59 @@ type MonitoringCollector struct {
29
29
lastScrapeDurationSecondsMetric prometheus.Gauge
30
30
collectorFillMissingLabels bool
31
31
monitoringDropDelegatedProjects bool
32
+
33
+ cache * CollectionCache
34
+ }
35
+
36
+ type CollectionCache struct {
37
+ // This map holds the read-only result of a collection run
38
+ // It will be served from the promethus scrape endpoint until the next
39
+ // collection is complete.
40
+ cachedTimeSeries map [string ]* TimeSeriesMetrics
41
+
42
+ // This map holds the (potentially incomplete) metrics that have been collected.
43
+ // Once completed it will replace the `cachedTimeSeries` and will start being served.
44
+ activeTimeSeries map [string ]* TimeSeriesMetrics
45
+
46
+ // Indicates whether there is a collection currently running, and populating `activeTimeSeries`
47
+ // at the moment.
48
+ collectionActive bool
49
+
50
+ // Guards `activeTimeSeries` and `collectionActive`
51
+ mu sync.Mutex
52
+ }
53
+
54
+ // Update the cache state to indicate that a collection has started
55
+ func (c * CollectionCache ) markCollectionStarted () {
56
+ log .Debugf ("markCollectionStarted" )
57
+ c .mu .Lock ()
58
+ c .collectionActive = true
59
+ c .mu .Unlock ()
60
+ }
61
+
62
+ // Update the cache state to indicate that a collection has completed
63
+ func (c * CollectionCache ) markCollectionCompleted () {
64
+ log .Debugf ("markCollectionCompleted" )
65
+ c .mu .Lock ()
66
+ defer c .mu .Unlock ()
67
+ collected := c .activeTimeSeries
68
+ c .cachedTimeSeries = collected
69
+ c .activeTimeSeries = make (map [string ]* TimeSeriesMetrics )
70
+ c .collectionActive = false
71
+ }
72
+
73
+ // Check if there is a collection running int he background
74
+ func (c * CollectionCache ) isCollectionActive () bool {
75
+ c .mu .Lock ()
76
+ defer c .mu .Unlock ()
77
+ return c .collectionActive
78
+ }
79
+
80
+ // During a collection, this func should be used to save the collected data
81
+ func (c * CollectionCache ) putMetric (metricType string , timeSeries * TimeSeriesMetrics ) {
82
+ c .mu .Lock ()
83
+ c .activeTimeSeries [metricType ] = timeSeries
84
+ c .mu .Unlock ()
32
85
}
33
86
34
87
func NewMonitoringCollector (
@@ -114,6 +167,11 @@ func NewMonitoringCollector(
114
167
lastScrapeDurationSecondsMetric : lastScrapeDurationSecondsMetric ,
115
168
collectorFillMissingLabels : collectorFillMissingLabels ,
116
169
monitoringDropDelegatedProjects : monitoringDropDelegatedProjects ,
170
+ cache : & CollectionCache {
171
+ cachedTimeSeries : make (map [string ]* TimeSeriesMetrics ),
172
+ activeTimeSeries : make (map [string ]* TimeSeriesMetrics ),
173
+ collectionActive : false ,
174
+ },
117
175
}
118
176
119
177
return monitoringCollector , nil
@@ -129,32 +187,41 @@ func (c *MonitoringCollector) Describe(ch chan<- *prometheus.Desc) {
129
187
}
130
188
131
189
func (c * MonitoringCollector ) Collect (ch chan <- prometheus.Metric ) {
132
- var begun = time .Now ()
133
190
134
- errorMetric := float64 (0 )
135
- if err := c .reportMonitoringMetrics (ch ); err != nil {
136
- errorMetric = float64 (1 )
137
- c .scrapeErrorsTotalMetric .Inc ()
138
- log .Errorf ("Error while getting Google Stackdriver Monitoring metrics: %s" , err )
191
+ for _ , timeSeries := range c .cache .cachedTimeSeries {
192
+ timeSeries .Complete (ch )
139
193
}
140
- c .scrapeErrorsTotalMetric .Collect (ch )
141
194
195
+ c .scrapeErrorsTotalMetric .Collect (ch )
142
196
c .apiCallsTotalMetric .Collect (ch )
143
-
144
- c .scrapesTotalMetric .Inc ()
145
197
c .scrapesTotalMetric .Collect (ch )
146
-
147
- c .lastScrapeErrorMetric .Set (errorMetric )
148
198
c .lastScrapeErrorMetric .Collect (ch )
149
-
150
- c .lastScrapeTimestampMetric .Set (float64 (time .Now ().Unix ()))
151
199
c .lastScrapeTimestampMetric .Collect (ch )
152
-
153
- c .lastScrapeDurationSecondsMetric .Set (time .Since (begun ).Seconds ())
154
200
c .lastScrapeDurationSecondsMetric .Collect (ch )
201
+
202
+ if ! c .cache .isCollectionActive () {
203
+ go func () {
204
+ start := time .Now ()
205
+ errorMetric := float64 (0 )
206
+
207
+ c .cache .markCollectionStarted ()
208
+ if err := c .updateMetricsCache (); err != nil {
209
+ errorMetric = float64 (1 )
210
+ c .scrapeErrorsTotalMetric .Inc ()
211
+ log .Errorf ("Error while getting Google Stackdriver Monitoring metrics: %s" , err )
212
+ }
213
+
214
+ c .scrapesTotalMetric .Inc ()
215
+ c .lastScrapeErrorMetric .Set (errorMetric )
216
+ c .lastScrapeTimestampMetric .Set (float64 (time .Now ().Unix ()))
217
+ c .lastScrapeDurationSecondsMetric .Set (time .Since (start ).Seconds ())
218
+
219
+ c .cache .markCollectionCompleted ()
220
+ }()
221
+ }
155
222
}
156
223
157
- func (c * MonitoringCollector ) reportMonitoringMetrics ( ch chan <- prometheus. Metric ) error {
224
+ func (c * MonitoringCollector ) updateMetricsCache ( ) error {
158
225
metricDescriptorsFunction := func (page * monitoring.ListMetricDescriptorsResponse ) error {
159
226
var wg = & sync.WaitGroup {}
160
227
@@ -181,7 +248,7 @@ func (c *MonitoringCollector) reportMonitoringMetrics(ch chan<- prometheus.Metri
181
248
182
249
for _ , metricDescriptor := range uniqueDescriptors {
183
250
wg .Add (1 )
184
- go func (metricDescriptor * monitoring.MetricDescriptor , ch chan <- prometheus. Metric ) {
251
+ go func (metricDescriptor * monitoring.MetricDescriptor ) {
185
252
defer wg .Done ()
186
253
log .Debugf ("Retrieving Google Stackdriver Monitoring metrics for descriptor `%s`..." , metricDescriptor .Type )
187
254
filter := fmt .Sprintf ("metric.type=\" %s\" " , metricDescriptor .Type )
@@ -193,9 +260,13 @@ func (c *MonitoringCollector) reportMonitoringMetrics(ch chan<- prometheus.Metri
193
260
}
194
261
timeSeriesListCall := c .monitoringService .Projects .TimeSeries .List (utils .ProjectResource (c .projectID )).
195
262
Filter (filter ).
263
+ PageSize (100000 ).
196
264
IntervalStartTime (startTime .Format (time .RFC3339Nano )).
197
265
IntervalEndTime (endTime .Format (time .RFC3339Nano ))
198
266
267
+ pageNumber := 0
268
+
269
+ start := time .Now ()
199
270
for {
200
271
c .apiCallsTotalMetric .Inc ()
201
272
page , err := timeSeriesListCall .Do ()
@@ -204,20 +275,26 @@ func (c *MonitoringCollector) reportMonitoringMetrics(ch chan<- prometheus.Metri
204
275
errChannel <- err
205
276
break
206
277
}
278
+
207
279
if page == nil {
208
280
break
209
281
}
210
- if err := c .reportTimeSeriesMetrics (page , metricDescriptor , ch ); err != nil {
282
+ if err := c .updateMetricsCacheForMetric (page , metricDescriptor ); err != nil {
211
283
log .Errorf ("Error reporting Time Series metrics for descriptor `%s`: %v" , metricDescriptor .Type , err )
212
284
errChannel <- err
213
285
break
214
286
}
215
287
if page .NextPageToken == "" {
216
288
break
217
289
}
290
+ pageNumber ++
218
291
timeSeriesListCall .PageToken (page .NextPageToken )
219
292
}
220
- }(metricDescriptor , ch )
293
+
294
+ elapsed := time .Since (start )
295
+ log .Debugf ("Took %s to retrieve %v pages for metric descriptor %s" , elapsed , pageNumber + 1 , metricDescriptor .Type )
296
+
297
+ }(metricDescriptor )
221
298
}
222
299
223
300
wg .Wait ()
@@ -257,18 +334,15 @@ func (c *MonitoringCollector) reportMonitoringMetrics(ch chan<- prometheus.Metri
257
334
return <- errChannel
258
335
}
259
336
260
- func (c * MonitoringCollector ) reportTimeSeriesMetrics (
337
+ func (c * MonitoringCollector ) updateMetricsCacheForMetric (
261
338
page * monitoring.ListTimeSeriesResponse ,
262
- metricDescriptor * monitoring.MetricDescriptor ,
263
- ch chan <- prometheus.Metric ,
264
- ) error {
339
+ metricDescriptor * monitoring.MetricDescriptor ) error {
265
340
var metricValue float64
266
341
var metricValueType prometheus.ValueType
267
342
var newestTSPoint * monitoring.Point
268
343
269
344
timeSeriesMetrics := & TimeSeriesMetrics {
270
345
metricDescriptor : metricDescriptor ,
271
- ch : ch ,
272
346
fillMissingLabels : c .collectorFillMissingLabels ,
273
347
constMetrics : make (map [string ][]ConstMetric ),
274
348
histogramMetrics : make (map [string ][]HistogramMetric ),
@@ -354,7 +428,7 @@ func (c *MonitoringCollector) reportTimeSeriesMetrics(
354
428
355
429
timeSeriesMetrics .CollectNewConstMetric (timeSeries , labelKeys , metricValueType , metricValue , labelValues )
356
430
}
357
- timeSeriesMetrics . Complete ( )
431
+ c . cache . putMetric ( metricDescriptor . Type , timeSeriesMetrics )
358
432
return nil
359
433
}
360
434
0 commit comments