Skip to content

Commit

Permalink
Configure Prometheus scrape configuration dynamically (#391)
Browse files Browse the repository at this point in the history
* Configure Prometheus scrape configuration dynamically

In a scenario where a machine does not have the hardware to
install one exporter, we don't need to configure it on COS.

When it's not dynamic, grafana-agent might think that a target is
down when the exporter is actually just not installed.

Fix: #287
  • Loading branch information
gabrielcocenza authored Feb 12, 2025
1 parent 0d1ee3e commit 52a34ea
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 17 deletions.
62 changes: 45 additions & 17 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"""Charm the application."""

import logging
from typing import Any, List, Set, Tuple
from typing import Any, Dict, List, Set, Tuple

import ops
from charms.grafana_agent.v0.cos_agent import COSAgentProvider
Expand All @@ -28,22 +28,6 @@ def __init__(self, *args: Any) -> None:
super().__init__(*args)
self.hw_tool_helper = HWToolHelper()

# Add refresh_events to COSAgentProvider to update relation data when
# config changed (default behavior) and upgrade charm. This is useful
# for updating alert rules.
self.cos_agent_provider = COSAgentProvider(
self,
refresh_events=[self.on.config_changed, self.on.upgrade_charm],
metrics_endpoints=[
{"path": "/metrics", "port": int(self.model.config["hardware-exporter-port"])},
{"path": "/metrics", "port": int(self.model.config["smartctl-exporter-port"])},
{"path": "/metrics", "port": 9400},
],
# Setting scrape_timeout as collect_timeout in the `duration` format specified in
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/#duration
scrape_configs=[{"scrape_timeout": f"{int(self.model.config['collect-timeout'])}s"}],
)

self._stored.set_default(
# resource_installed is a flag that tracks the installation state for
# the juju resources and also the different exporters
Expand All @@ -64,6 +48,15 @@ def __init__(self, *args: Any) -> None:
)
self.framework.observe(self.on.redetect_hardware_action, self._on_redetect_hardware)

# Add refresh_events to COSAgentProvider to update relation data when
# config changed (default behavior) and upgrade charm. This is useful
# for updating alert rules.
self.cos_agent_provider = COSAgentProvider(
self,
refresh_events=[self.on.config_changed, self.on.upgrade_charm],
scrape_configs=self._scrape_config,
)

self.num_cos_agent_relations = self.get_num_cos_agent_relations("cos-agent")

@property
Expand Down Expand Up @@ -298,6 +291,41 @@ def validate_configs(self) -> Tuple[bool, str]:

return True, "Charm config is valid."

def _scrape_config(self) -> List[Dict[str, Any]]:
    """Generate the scrape config for the exporters currently in use.

    The first entry carries only ``scrape_timeout``, built from the
    ``collect-timeout`` charm config option. After it comes one scrape job
    per recognised exporter found in ``self.exporters``, in iteration order,
    so exporters that are not present produce no target.

    Returns:
        A list of scrape config dictionaries: the timeout entry followed by
        one ``/metrics`` job on ``localhost`` for each matching exporter.
    """

    def _metrics_job(port: Any) -> Dict[str, Any]:
        """Build the scrape job for an exporter listening on ``port``."""
        return {
            "metrics_path": "/metrics",
            "static_configs": [{"targets": [f"localhost:{port}"]}],
        }

    config = self.model.config

    # Setting scrape_timeout as collect_timeout in the `duration` format specified in
    # https://prometheus.io/docs/prometheus/latest/configuration/configuration/#duration
    # NOTE(review): the timeout is emitted as its own entry rather than on each
    # job below -- confirm this is how the consumer of this list applies it.
    scrape_config: List[Dict[str, Any]] = [
        {"scrape_timeout": f"{config['collect-timeout']}s"}
    ]

    for exporter in self.exporters:
        # The checks are deliberately independent (not elif), and each port
        # option is only read when its exporter is actually present.
        if isinstance(exporter, HardwareExporter):
            scrape_config.append(_metrics_job(config["hardware-exporter-port"]))
        if isinstance(exporter, SmartCtlExporter):
            scrape_config.append(_metrics_job(config["smartctl-exporter-port"]))
        if isinstance(exporter, DCGMExporter):
            # The DCGM exporter port is fixed here, not read from charm config.
            scrape_config.append(_metrics_job(9400))
    return scrape_config

@property
def cos_agent_related(self) -> bool:
"""Return True if cos-agent relation is present."""
Expand Down
36 changes: 36 additions & 0 deletions tests/unit/test_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import json
import unittest
from pathlib import Path
from unittest import mock

import ops
Expand Down Expand Up @@ -796,3 +797,38 @@ def test_stored_tools_remove_legacy_smartctl(self):
self.harness.begin()
self.harness.charm._stored.stored_tools = {"smartctl"}
assert self.harness.charm.stored_tools == set()

@mock.patch("service.get_bmc_address")
@mock.patch("charm.HardwareObserverCharm.exporters", new_callable=mock.PropertyMock)
def test_scrape_config(self, mock_exporters, _):
    """Check the scrape config when all three exporter types are present."""
    self.harness.begin()
    charm_config = self.harness.charm.model.config

    # One exporter of each kind, in the order their jobs are expected below.
    all_exporters = [
        HardwareExporter(Path(), charm_config, set()),
        SmartCtlExporter(charm_config),
        DCGMExporter(charm_config),
    ]
    mock_exporters.return_value = all_exporters

    expected = [
        {"scrape_timeout": "10s"},
        {"metrics_path": "/metrics", "static_configs": [{"targets": ["localhost:10200"]}]},
        {"metrics_path": "/metrics", "static_configs": [{"targets": ["localhost:10201"]}]},
        {"metrics_path": "/metrics", "static_configs": [{"targets": ["localhost:9400"]}]},
    ]
    actual = self.harness.charm._scrape_config()
    assert actual == expected

@mock.patch("charm.HardwareObserverCharm.exporters", new_callable=mock.PropertyMock)
def test_scrape_config_no_specific_hardware(self, mock_exporters):
    """Check the scrape config when only the smartctl exporter is present."""
    # Simulate hardware with no NVIDIA GPU and no tools for the hw exporter.
    self.harness.begin()
    charm_config = self.harness.charm.model.config
    mock_exporters.return_value = [SmartCtlExporter(charm_config)]

    expected = [
        {"scrape_timeout": "10s"},
        {"metrics_path": "/metrics", "static_configs": [{"targets": ["localhost:10201"]}]},
    ]
    actual = self.harness.charm._scrape_config()
    assert actual == expected

0 comments on commit 52a34ea

Please sign in to comment.