diff --git a/CHANGELOG.md b/CHANGELOG.md
index ec59595f4..94b86c54e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+
+- Add alert `ClusterCrossplaneResourcesNotReady` for Crossplane resources that are critical for clusters
+
 ### Fixed
 
 - fix capi-kubeadmconfig rule for hybrid providers
diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/cluster-crossplane.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/cluster-crossplane.rules.yml
new file mode 100644
index 000000000..c77d60427
--- /dev/null
+++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/cluster-crossplane.rules.yml
@@ -0,0 +1,30 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  creationTimestamp: null
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+  name: cluster-crossplane.rules
+  namespace: {{ .Values.namespace }}
+spec:
+  groups:
+  - name: cluster-crossplane
+    rules:
+    - alert: ClusterCrossplaneResourcesNotReady
+      annotations:
+        # Crossplane doesn't offer object names and the objects are stored on the MC (presumably the
+        # management cluster), so as of 2025-01 this alert can't be made WC (workload cluster) specific.
+        description: '{{`Not all managed Crossplane resources of type "{{ $labels.gvk }}" on {{ $labels.cluster_id }} are ready. This could affect creation or health of workload clusters.`}}'
+        opsrecipe: cluster-crossplane-resources
+      # Fires per gvk when the "exists" count differs from the "ready" count. Only critical kinds deployed by
+      # cluster-aws (via aws-nth-crossplane-resources, cilium-crossplane-resources, ...) are matched.
+      expr: crossplane_managed_resource_exists{gvk=~".*Kind=(Queue|QueuePolicy|Role|Rule|SecurityGroup|SecurityGroupEgressRule|SecurityGroupIngressRule|Target)"} != crossplane_managed_resource_ready{gvk=~".*Kind=(Queue|QueuePolicy|Role|Rule|SecurityGroup|SecurityGroupEgressRule|SecurityGroupIngressRule|Target)"}
+      for: 15m
+      labels:
+        area: kaas
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
+        severity: page
+        team: phoenix
diff --git a/test/tests/providers/capi/capa/kaas/phoenix/alerting-rules/cluster-crossplane.rules.test.yml b/test/tests/providers/capi/capa/kaas/phoenix/alerting-rules/cluster-crossplane.rules.test.yml
new file mode 100644
index 000000000..5da7fbadb
--- /dev/null
+++ b/test/tests/providers/capi/capa/kaas/phoenix/alerting-rules/cluster-crossplane.rules.test.yml
@@ -0,0 +1,28 @@
+rule_files:
+  - cluster-crossplane.rules.yml
+
+tests:
+  - interval: 1m
+    input_series:
+      - series: 'crossplane_managed_resource_exists{gvk="cloudwatchevents.aws.upbound.io/v1beta1, Kind=Rule", cluster_id="mymc"}'
+        values: "6x20"  # constant 6 for the whole run: six resources of this kind exist
+      - series: 'crossplane_managed_resource_ready{gvk="cloudwatchevents.aws.upbound.io/v1beta1, Kind=Rule", cluster_id="mymc"}'
+        values: "5x20"  # constant 5: one resource never becomes ready, so exists != ready throughout
+
+    alert_rule_test:
+      - alertname: ClusterCrossplaneResourcesNotReady
+        eval_time: 20m  # later than the rule's 15m "for" duration, so the alert must be firing
+        exp_alerts:
+          - exp_labels:
+              area: kaas
+              cancel_if_cluster_status_creating: "true"
+              cancel_if_cluster_status_deleting: "true"
+              cancel_if_cluster_status_updating: "true"
+              cancel_if_outside_working_hours: "false"
+              cluster_id: "mymc"
+              gvk: "cloudwatchevents.aws.upbound.io/v1beta1, Kind=Rule"
+              severity: page
+              team: phoenix
+            exp_annotations:
+              description: 'Not all managed Crossplane resources of type "cloudwatchevents.aws.upbound.io/v1beta1, Kind=Rule" on mymc are ready. This could affect creation or health of workload clusters.'
+              opsrecipe: cluster-crossplane-resources