diff --git a/charts/cluster/prometheus_rules/cluster-ha-critical.yaml b/charts/cluster/prometheus_rules/cluster-ha-critical.yaml index 246a5af6b0..2f1a7433fa 100644 --- a/charts/cluster/prometheus_rules/cluster-ha-critical.yaml +++ b/charts/cluster/prometheus_rules/cluster-ha-critical.yaml @@ -4,18 +4,18 @@ alert: {{ $alert }} annotations: summary: CNPG Cluster has no standby replicas! description: |- - CloudNativePG Cluster "{{ .labels.job }}" has no ready standby replicas. Your cluster at a severe + CloudNativePG Cluster "{{ .labels.job }}" has no ready standby replicas. The cluster is at severe risk of data loss and downtime if the primary instance fails. The primary instance is still online and able to serve queries, although connections to the `-ro` endpoint - will fail. The `-r` endpoint os operating at reduced capacity and all traffic is being served by the main. + will fail. The `-r` endpoint is operating at reduced capacity and all traffic is being served by the primary. This can happen during a normal fail-over or automated minor version upgrades in a cluster with 2 or less instances. The replaced instance may need some time to catch-up with the cluster primary instance. - This alarm will be always trigger if your cluster is configured to run with only 1 instance. In this + This alarm will be constantly triggered if your cluster is configured to run with only 1 instance. In this case you may want to silence it. 
- runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHACritical.md + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/{{ $alert }}.md expr: | max by (job) (cnpg_pg_replication_streaming_replicas{namespace="{{ .namespace }}"} - cnpg_pg_replication_is_wal_receiver_up{namespace="{{ .namespace }}"}) < 1 for: 5m diff --git a/charts/cluster/prometheus_rules/cluster-ha-warning.yaml b/charts/cluster/prometheus_rules/cluster-ha-warning.yaml index 736ddf393d..79c43cde31 100644 --- a/charts/cluster/prometheus_rules/cluster-ha-warning.yaml +++ b/charts/cluster/prometheus_rules/cluster-ha-warning.yaml @@ -6,14 +6,14 @@ annotations: description: |- CloudNativePG Cluster "{{ .labels.job }}" has only {{ .value }} standby replicas, putting your cluster at risk if another instance fails. The cluster is still able to operate normally, although - the `-ro` and `-r` endpoints operate at reduced capacity. + the `-ro` and `-r` endpoints are operating at reduced capacity. This can happen during a normal fail-over or automated minor version upgrades. The replaced instance may need some time to catch-up with the cluster primary instance. This alarm will be constantly triggered if your cluster is configured to run with less than 3 instances. In this case you may want to silence it. 
- runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHAWarning.md + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/{{ $alert }}.md expr: | max by (job) (cnpg_pg_replication_streaming_replicas{namespace="{{ .namespace }}"} - cnpg_pg_replication_is_wal_receiver_up{namespace="{{ .namespace }}"}) < 2 for: 5m diff --git a/charts/cluster/prometheus_rules/cluster-high_connection-critical.yaml b/charts/cluster/prometheus_rules/cluster-high_connection-critical.yaml index df13ce3b37..2843168f68 100644 --- a/charts/cluster/prometheus_rules/cluster-high_connection-critical.yaml +++ b/charts/cluster/prometheus_rules/cluster-high_connection-critical.yaml @@ -6,7 +6,7 @@ annotations: description: |- CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" instance {{ .labels.pod }} is using {{ .value }}% of the maximum number of connections. - runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsCritical.md + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/{{ $alert }}.md expr: | sum by (pod) (cnpg_backends_total{namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"}) * 100 > 95 for: 5m diff --git a/charts/cluster/prometheus_rules/cluster-high_connection-warning.yaml b/charts/cluster/prometheus_rules/cluster-high_connection-warning.yaml index 73cc783925..f539236770 100644 --- a/charts/cluster/prometheus_rules/cluster-high_connection-warning.yaml +++ b/charts/cluster/prometheus_rules/cluster-high_connection-warning.yaml @@ -6,7 +6,7 @@ annotations: description: |- CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" instance {{ .labels.pod }} is using {{ .value }}% of the maximum number of connections. 
- runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsWarning.md + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/{{ $alert }}.md expr: | sum by (pod) (cnpg_backends_total{namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"}) * 100 > 80 for: 5m diff --git a/charts/cluster/prometheus_rules/cluster-instances_on_same_node.yaml b/charts/cluster/prometheus_rules/cluster-instances_on_same_node.yaml index aafcfab1e2..07cd4ff32a 100644 --- a/charts/cluster/prometheus_rules/cluster-instances_on_same_node.yaml +++ b/charts/cluster/prometheus_rules/cluster-instances_on_same_node.yaml @@ -8,7 +8,7 @@ annotations: instances on the same node {{ .labels.node }}. A failure or scheduled downtime of a single node will lead to a potential service disruption and/or data loss. - runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterInstancesOnSameNode.md + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/{{ $alert }}.md expr: | count by (node) (kube_pod_info{namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"}) > 1 for: 5m diff --git a/charts/cluster/prometheus_rules/cluster-low_disk_space-critical.yaml b/charts/cluster/prometheus_rules/cluster-low_disk_space-critical.yaml index ea1c383458..5baf144370 100644 --- a/charts/cluster/prometheus_rules/cluster-low_disk_space-critical.yaml +++ b/charts/cluster/prometheus_rules/cluster-low_disk_space-critical.yaml @@ -4,8 +4,8 @@ alert: {{ $alert }} annotations: summary: CNPG Instance is running out of disk space! description: |- - CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" is running extremely low on disk space. Check attached PVCs! 
- runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceCritical.md + CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" is running extremely low on disk space. Check attached PVCs! Current disk space usage is {{ .value }}% of the total capacity. + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/{{ $alert }}.md expr: | max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"} / kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"})) * 100 > 90 OR max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}-wal"} / kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}-wal"})) * 100 > 90 OR diff --git a/charts/cluster/prometheus_rules/cluster-low_disk_space-warning.yaml b/charts/cluster/prometheus_rules/cluster-low_disk_space-warning.yaml index 4b7e3eaa2e..dfbbddb2c1 100644 --- a/charts/cluster/prometheus_rules/cluster-low_disk_space-warning.yaml +++ b/charts/cluster/prometheus_rules/cluster-low_disk_space-warning.yaml @@ -4,8 +4,8 @@ alert: {{ $alert }} annotations: summary: CNPG Instance is running out of disk space. description: |- - CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" is running low on disk space. Check attached PVCs. - runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceWarning.md + CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" is running low on disk space. Check attached PVCs. Current disk space usage is {{ .value }}% of the total capacity. 
+ runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/{{ $alert }}.md expr: | max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"} / kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"})) * 100 > 70 OR max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}-wal"} / kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}-wal"})) * 100 > 70 OR diff --git a/charts/cluster/prometheus_rules/cluster-offline.yaml b/charts/cluster/prometheus_rules/cluster-offline.yaml index 4206c02f3c..f9e1361201 100644 --- a/charts/cluster/prometheus_rules/cluster-offline.yaml +++ b/charts/cluster/prometheus_rules/cluster-offline.yaml @@ -8,7 +8,7 @@ annotations: Having an offline cluster means your applications will not be able to access the database, leading to potential service disruption and/or data loss. - runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterOffline.md + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/{{ $alert }}.md expr: | (count(cnpg_collector_up{namespace="{{ .namespace }}",pod=~"{{ .podSelector }}"}) OR on() vector(0)) == 0 for: 5m diff --git a/charts/cluster/prometheus_rules/cluster-zone_spread-warning.yaml b/charts/cluster/prometheus_rules/cluster-zone_spread-warning.yaml index 41fa4002ae..e253944ec4 100644 --- a/charts/cluster/prometheus_rules/cluster-zone_spread-warning.yaml +++ b/charts/cluster/prometheus_rules/cluster-zone_spread-warning.yaml @@ -7,7 +7,7 @@ annotations: CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" has instances in the same availability zone. 
A disaster in one availability zone will lead to a potential service disruption and/or data loss. - runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterZoneSpreadWarning.md + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/{{ $alert }}.md expr: | {{ .Values.cluster.instances }} > count(count by (label_topology_kubernetes_io_zone) (kube_pod_info{namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"} * on(node,instance) group_left(label_topology_kubernetes_io_zone) kube_node_labels)) < 3 for: 5m diff --git a/charts/cluster/templates/NOTES.txt b/charts/cluster/templates/NOTES.txt index a0cc2fef3d..c0013d418a 100644 --- a/charts/cluster/templates/NOTES.txt +++ b/charts/cluster/templates/NOTES.txt @@ -8,9 +8,9 @@ {{ if .Release.IsInstall }} -The {{ include "cluster.color-info" (include "cluster.fullname" .) }} has been installed successfully. +The {{ include "cluster.color-info" (include "cluster.fullname" .) }} cluster has been installed successfully. {{ else if .Release.IsUpgrade }} -The {{ include "cluster.color-info" (include "cluster.fullname" .) }} has been upgraded successfully. +The {{ include "cluster.color-info" (include "cluster.fullname" .) }} cluster has been upgraded successfully. {{ end }} ██████ ██ ██ ████ ██ ██ ██ ███████ ████████