@@ -29,6 +29,59 @@ type MonitoringCollector struct {
2929 lastScrapeDurationSecondsMetric prometheus.Gauge
3030 collectorFillMissingLabels bool
3131 monitoringDropDelegatedProjects bool
32+
33+ cache * CollectionCache
34+ }
35+
36+ type CollectionCache struct {
37+ // This map holds the read-only result of a collection run
38+ // It will be served from the promethus scrape endpoint until the next
39+ // collection is complete.
40+ cachedTimeSeries map [string ]* TimeSeriesMetrics
41+
42+ // This map holds the (potentially incomplete) metrics that have been collected.
43+ // Once completed it will replace the `cachedTimeSeries` and will start being served.
44+ activeTimeSeries map [string ]* TimeSeriesMetrics
45+
46+ // Indicates whether there is a collection currently running, and populating `activeTimeSeries`
47+ // at the moment.
48+ collectionActive bool
49+
50+ // Guards `activeTimeSeries` and `collectionActive`
51+ mu sync.Mutex
52+ }
53+
54+ // Update the cache state to indicate that a collection has started
55+ func (c * CollectionCache ) markCollectionStarted () {
56+ log .Debugf ("markCollectionStarted" )
57+ c .mu .Lock ()
58+ c .collectionActive = true
59+ c .mu .Unlock ()
60+ }
61+
62+ // Update the cache state to indicate that a collection has completed
63+ func (c * CollectionCache ) markCollectionCompleted () {
64+ log .Debugf ("markCollectionCompleted" )
65+ c .mu .Lock ()
66+ defer c .mu .Unlock ()
67+ collected := c .activeTimeSeries
68+ c .cachedTimeSeries = collected
69+ c .activeTimeSeries = make (map [string ]* TimeSeriesMetrics )
70+ c .collectionActive = false
71+ }
72+
73+ // Check if there is a collection running int he background
74+ func (c * CollectionCache ) isCollectionActive () bool {
75+ c .mu .Lock ()
76+ defer c .mu .Unlock ()
77+ return c .collectionActive
78+ }
79+
80+ // During a collection, this func should be used to save the collected data
81+ func (c * CollectionCache ) putMetric (metricType string , timeSeries * TimeSeriesMetrics ) {
82+ c .mu .Lock ()
83+ c .activeTimeSeries [metricType ] = timeSeries
84+ c .mu .Unlock ()
3285}
3386
3487func NewMonitoringCollector (
@@ -114,6 +167,11 @@ func NewMonitoringCollector(
114167 lastScrapeDurationSecondsMetric : lastScrapeDurationSecondsMetric ,
115168 collectorFillMissingLabels : collectorFillMissingLabels ,
116169 monitoringDropDelegatedProjects : monitoringDropDelegatedProjects ,
170+ cache : & CollectionCache {
171+ cachedTimeSeries : make (map [string ]* TimeSeriesMetrics ),
172+ activeTimeSeries : make (map [string ]* TimeSeriesMetrics ),
173+ collectionActive : false ,
174+ },
117175 }
118176
119177 return monitoringCollector , nil
@@ -129,32 +187,41 @@ func (c *MonitoringCollector) Describe(ch chan<- *prometheus.Desc) {
129187}
130188
131189func (c * MonitoringCollector ) Collect (ch chan <- prometheus.Metric ) {
132- var begun = time .Now ()
133190
134- errorMetric := float64 (0 )
135- if err := c .reportMonitoringMetrics (ch ); err != nil {
136- errorMetric = float64 (1 )
137- c .scrapeErrorsTotalMetric .Inc ()
138- log .Errorf ("Error while getting Google Stackdriver Monitoring metrics: %s" , err )
191+ for _ , timeSeries := range c .cache .cachedTimeSeries {
192+ timeSeries .Complete (ch )
139193 }
140- c .scrapeErrorsTotalMetric .Collect (ch )
141194
195+ c .scrapeErrorsTotalMetric .Collect (ch )
142196 c .apiCallsTotalMetric .Collect (ch )
143-
144- c .scrapesTotalMetric .Inc ()
145197 c .scrapesTotalMetric .Collect (ch )
146-
147- c .lastScrapeErrorMetric .Set (errorMetric )
148198 c .lastScrapeErrorMetric .Collect (ch )
149-
150- c .lastScrapeTimestampMetric .Set (float64 (time .Now ().Unix ()))
151199 c .lastScrapeTimestampMetric .Collect (ch )
152-
153- c .lastScrapeDurationSecondsMetric .Set (time .Since (begun ).Seconds ())
154200 c .lastScrapeDurationSecondsMetric .Collect (ch )
201+
202+ if ! c .cache .isCollectionActive () {
203+ go func () {
204+ start := time .Now ()
205+ errorMetric := float64 (0 )
206+
207+ c .cache .markCollectionStarted ()
208+ if err := c .updateMetricsCache (); err != nil {
209+ errorMetric = float64 (1 )
210+ c .scrapeErrorsTotalMetric .Inc ()
211+ log .Errorf ("Error while getting Google Stackdriver Monitoring metrics: %s" , err )
212+ }
213+
214+ c .scrapesTotalMetric .Inc ()
215+ c .lastScrapeErrorMetric .Set (errorMetric )
216+ c .lastScrapeTimestampMetric .Set (float64 (time .Now ().Unix ()))
217+ c .lastScrapeDurationSecondsMetric .Set (time .Since (start ).Seconds ())
218+
219+ c .cache .markCollectionCompleted ()
220+ }()
221+ }
155222}
156223
157- func (c * MonitoringCollector ) reportMonitoringMetrics ( ch chan <- prometheus. Metric ) error {
224+ func (c * MonitoringCollector ) updateMetricsCache ( ) error {
158225 metricDescriptorsFunction := func (page * monitoring.ListMetricDescriptorsResponse ) error {
159226 var wg = & sync.WaitGroup {}
160227
@@ -181,7 +248,7 @@ func (c *MonitoringCollector) reportMonitoringMetrics(ch chan<- prometheus.Metri
181248
182249 for _ , metricDescriptor := range uniqueDescriptors {
183250 wg .Add (1 )
184- go func (metricDescriptor * monitoring.MetricDescriptor , ch chan <- prometheus. Metric ) {
251+ go func (metricDescriptor * monitoring.MetricDescriptor ) {
185252 defer wg .Done ()
186253 log .Debugf ("Retrieving Google Stackdriver Monitoring metrics for descriptor `%s`..." , metricDescriptor .Type )
187254 filter := fmt .Sprintf ("metric.type=\" %s\" " , metricDescriptor .Type )
@@ -193,9 +260,13 @@ func (c *MonitoringCollector) reportMonitoringMetrics(ch chan<- prometheus.Metri
193260 }
194261 timeSeriesListCall := c .monitoringService .Projects .TimeSeries .List (utils .ProjectResource (c .projectID )).
195262 Filter (filter ).
263+ PageSize (100000 ).
196264 IntervalStartTime (startTime .Format (time .RFC3339Nano )).
197265 IntervalEndTime (endTime .Format (time .RFC3339Nano ))
198266
267+ pageNumber := 0
268+
269+ start := time .Now ()
199270 for {
200271 c .apiCallsTotalMetric .Inc ()
201272 page , err := timeSeriesListCall .Do ()
@@ -204,20 +275,26 @@ func (c *MonitoringCollector) reportMonitoringMetrics(ch chan<- prometheus.Metri
204275 errChannel <- err
205276 break
206277 }
278+
207279 if page == nil {
208280 break
209281 }
210- if err := c .reportTimeSeriesMetrics (page , metricDescriptor , ch ); err != nil {
282+ if err := c .updateMetricsCacheForMetric (page , metricDescriptor ); err != nil {
211283 log .Errorf ("Error reporting Time Series metrics for descriptor `%s`: %v" , metricDescriptor .Type , err )
212284 errChannel <- err
213285 break
214286 }
215287 if page .NextPageToken == "" {
216288 break
217289 }
290+ pageNumber ++
218291 timeSeriesListCall .PageToken (page .NextPageToken )
219292 }
220- }(metricDescriptor , ch )
293+
294+ elapsed := time .Since (start )
295+ log .Debugf ("Took %s to retrieve %v pages for metric descriptor %s" , elapsed , pageNumber + 1 , metricDescriptor .Type )
296+
297+ }(metricDescriptor )
221298 }
222299
223300 wg .Wait ()
@@ -257,18 +334,15 @@ func (c *MonitoringCollector) reportMonitoringMetrics(ch chan<- prometheus.Metri
257334 return <- errChannel
258335}
259336
260- func (c * MonitoringCollector ) reportTimeSeriesMetrics (
337+ func (c * MonitoringCollector ) updateMetricsCacheForMetric (
261338 page * monitoring.ListTimeSeriesResponse ,
262- metricDescriptor * monitoring.MetricDescriptor ,
263- ch chan <- prometheus.Metric ,
264- ) error {
339+ metricDescriptor * monitoring.MetricDescriptor ) error {
265340 var metricValue float64
266341 var metricValueType prometheus.ValueType
267342 var newestTSPoint * monitoring.Point
268343
269344 timeSeriesMetrics := & TimeSeriesMetrics {
270345 metricDescriptor : metricDescriptor ,
271- ch : ch ,
272346 fillMissingLabels : c .collectorFillMissingLabels ,
273347 constMetrics : make (map [string ][]ConstMetric ),
274348 histogramMetrics : make (map [string ][]HistogramMetric ),
@@ -354,7 +428,7 @@ func (c *MonitoringCollector) reportTimeSeriesMetrics(
354428
355429 timeSeriesMetrics .CollectNewConstMetric (timeSeries , labelKeys , metricValueType , metricValue , labelValues )
356430 }
357- timeSeriesMetrics . Complete ( )
431+ c . cache . putMetric ( metricDescriptor . Type , timeSeriesMetrics )
358432 return nil
359433}
360434
0 commit comments