Skip to content

Commit d858b08

Browse files
committed
add maintenance user. exporter image.
1 parent bacf4cb commit d858b08

File tree

11 files changed

+431
-44
lines changed

11 files changed

+431
-44
lines changed

Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ operator-yaml: operator-image
3333
cp operatorversions.json platforms/kubernetes/postgres-operator/deploy/versions.json
3434
cp jq-template.awk platforms/kubernetes/postgres-operator/deploy/jq-template.awk
3535
cd platforms/kubernetes/postgres-operator/deploy/; awk -f jq-template.awk postgres-operator.yaml.template > postgres-operator.yaml
36+
exporter-image:
37+
cd image/exporter; ./generate_image.sh
3638

3739
format:
3840
find ./ -path "./platforms/kubernetes/postgres-operator/postgres/*.py" | xargs yapf -i -vv

image/exporter/.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
versions.json
2+
jq-template.awk
3+
postgres
4+
autofailover_queries.yaml

image/exporter/Dockerfile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Exporter image: upstream postgres-exporter plus the custom query definition files
2+
FROM prometheuscommunity/postgres-exporter
3+
4+
COPY autofailover_queries.yaml /etc/autofailover_queries.yaml
5+
COPY queries.yaml /etc/queries.yaml

image/exporter/autofailover.yaml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Custom postgres_exporter queries against the pgautofailover.node table.
# Every entry exposes a single GAUGE named "count".

# Nodes whose reported replication state is neither 'quorum' nor 'unknown'.
pg_auto_failover_async:
  query: "select count(*) as count from pgautofailover.node where reportedrepstate <> 'quorum' and reportedrepstate <> 'unknown'"
  metrics:
    - count:
        usage: "GAUGE"
        description: "async node number"

# Nodes whose reported state is not one of 'single', 'primary', 'secondary'.
pg_auto_failover_invalid:
  query: "select count(*) as count from pgautofailover.node where reportedstate <> 'single' and reportedstate <> 'primary' and reportedstate <> 'secondary'"
  metrics:
    - count:
        usage: "GAUGE"
        description: "invalid status node number"

# Nodes that do not report PostgreSQL as running.
pg_auto_failover_not_running:
  query: "select count(*) as count from pgautofailover.node where reportedpgisrunning <> 't'"
  metrics:
    - count:
        usage: "GAUGE"
        description: "pg not running number"

# Nodes whose health column differs from 1.
# The column is aliased explicitly so the metric name does not depend on
# PostgreSQL's implicit naming of count(*), matching the queries above.
pg_auto_failover_unhealth:
  query: "select count(*) as count from pgautofailover.node where health <> 1"
  metrics:
    - count:
        usage: "GAUGE"
        description: "unhealth node number"

image/exporter/generate_image.sh

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#!/usr/bin/env bash
2+
set -Eeo pipefail
3+
4+
# build_image IMAGE PLATFORM
#
# Build IMAGE for PLATFORM (e.g. linux/amd64) from the Dockerfile in the
# current directory with `docker buildx build --no-cache`.
# When an image with exactly this repository:tag already exists locally the
# build is skipped, unless the caller-provided variable forcebuildimage is 1.
build_image()
{
    local image=$1
    local platform=$2

    # Fixed-string, whole-line match: the image reference must be treated
    # neither as a regex ('.' would match any character) nor as a substring
    # (foo:1.0 must not match foo:1.0.1). grep reads all of its input, so
    # docker never sees a closed pipe under `set -o pipefail`.
    if docker image ls --format "{{.Repository}}:{{.Tag}}" | grep -Fx -- "$image" > /dev/null; then
        if [ "${forcebuildimage:-}" = 1 ]; then
            echo "docker image $image exists, rebuilding the image ..."
        else
            echo "docker image $image exists, skipping ..."
            return
        fi
    fi

    echo "build docker image $image ..."
    docker buildx build --no-cache -t "$image" --platform "$platform" .
}
22+
23+
# `platform` is not assigned anywhere in this script, so it is expected to come
# from the caller's environment. Default to amd64: an empty value would make
# the build below run with the invalid platform string "linux/".
platform=${platform:-amd64}

# The image reference is read from versions.json. `// empty` turns a missing
# key into an empty string instead of the literal text "null".
image=$(jq -r '.image // empty' versions.json)
if [ -z "$image" ]; then
    echo "no .image entry found in versions.json" >&2
    exit 1
fi
if [ "$platform" = arm64 ]; then
    image=${image}-arm64
fi

# Concatenate the stock exporter queries and the pg_auto_failover queries into
# the single file that the Dockerfile copies to /etc/autofailover_queries.yaml.
cat queries.yaml autofailover.yaml > autofailover_queries.yaml

# queries.yaml can be refreshed from the upstream project with:
# wget https://raw.githubusercontent.com/prometheus-community/postgres_exporter/master/queries.yaml

build_image "$image" "linux/${platform}"

image/exporter/queries.yaml

Lines changed: 263 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,263 @@
1+
pg_replication:
2+
query: "SELECT CASE WHEN NOT pg_is_in_recovery() THEN 0 ELSE GREATEST (0, EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))) END AS lag"
3+
master: true
4+
metrics:
5+
- lag:
6+
usage: "GAUGE"
7+
description: "Replication lag behind master in seconds"
8+
9+
pg_postmaster:
10+
query: "SELECT pg_postmaster_start_time as start_time_seconds from pg_postmaster_start_time()"
11+
master: true
12+
metrics:
13+
- start_time_seconds:
14+
usage: "GAUGE"
15+
description: "Time at which postmaster started"
16+
17+
pg_stat_user_tables:
18+
query: |
19+
SELECT
20+
current_database() datname,
21+
schemaname,
22+
relname,
23+
seq_scan,
24+
seq_tup_read,
25+
idx_scan,
26+
idx_tup_fetch,
27+
n_tup_ins,
28+
n_tup_upd,
29+
n_tup_del,
30+
n_tup_hot_upd,
31+
n_live_tup,
32+
n_dead_tup,
33+
n_mod_since_analyze,
34+
COALESCE(last_vacuum, '1970-01-01Z') as last_vacuum,
35+
COALESCE(last_autovacuum, '1970-01-01Z') as last_autovacuum,
36+
COALESCE(last_analyze, '1970-01-01Z') as last_analyze,
37+
COALESCE(last_autoanalyze, '1970-01-01Z') as last_autoanalyze,
38+
vacuum_count,
39+
autovacuum_count,
40+
analyze_count,
41+
autoanalyze_count
42+
FROM
43+
pg_stat_user_tables
44+
metrics:
45+
- datname:
46+
usage: "LABEL"
47+
description: "Name of current database"
48+
- schemaname:
49+
usage: "LABEL"
50+
description: "Name of the schema that this table is in"
51+
- relname:
52+
usage: "LABEL"
53+
description: "Name of this table"
54+
- seq_scan:
55+
usage: "COUNTER"
56+
description: "Number of sequential scans initiated on this table"
57+
- seq_tup_read:
58+
usage: "COUNTER"
59+
description: "Number of live rows fetched by sequential scans"
60+
- idx_scan:
61+
usage: "COUNTER"
62+
description: "Number of index scans initiated on this table"
63+
- idx_tup_fetch:
64+
usage: "COUNTER"
65+
description: "Number of live rows fetched by index scans"
66+
- n_tup_ins:
67+
usage: "COUNTER"
68+
description: "Number of rows inserted"
69+
- n_tup_upd:
70+
usage: "COUNTER"
71+
description: "Number of rows updated"
72+
- n_tup_del:
73+
usage: "COUNTER"
74+
description: "Number of rows deleted"
75+
- n_tup_hot_upd:
76+
usage: "COUNTER"
77+
description: "Number of rows HOT updated (i.e., with no separate index update required)"
78+
- n_live_tup:
79+
usage: "GAUGE"
80+
description: "Estimated number of live rows"
81+
- n_dead_tup:
82+
usage: "GAUGE"
83+
description: "Estimated number of dead rows"
84+
- n_mod_since_analyze:
85+
usage: "GAUGE"
86+
description: "Estimated number of rows changed since last analyze"
87+
- last_vacuum:
88+
usage: "GAUGE"
89+
description: "Last time at which this table was manually vacuumed (not counting VACUUM FULL)"
90+
- last_autovacuum:
91+
usage: "GAUGE"
92+
description: "Last time at which this table was vacuumed by the autovacuum daemon"
93+
- last_analyze:
94+
usage: "GAUGE"
95+
description: "Last time at which this table was manually analyzed"
96+
- last_autoanalyze:
97+
usage: "GAUGE"
98+
description: "Last time at which this table was analyzed by the autovacuum daemon"
99+
- vacuum_count:
100+
usage: "COUNTER"
101+
description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)"
102+
- autovacuum_count:
103+
usage: "COUNTER"
104+
description: "Number of times this table has been vacuumed by the autovacuum daemon"
105+
- analyze_count:
106+
usage: "COUNTER"
107+
description: "Number of times this table has been manually analyzed"
108+
- autoanalyze_count:
109+
usage: "COUNTER"
110+
description: "Number of times this table has been analyzed by the autovacuum daemon"
111+
112+
pg_statio_user_tables:
113+
query: "SELECT current_database() datname, schemaname, relname, heap_blks_read, heap_blks_hit, idx_blks_read, idx_blks_hit, toast_blks_read, toast_blks_hit, tidx_blks_read, tidx_blks_hit FROM pg_statio_user_tables"
114+
metrics:
115+
- datname:
116+
usage: "LABEL"
117+
description: "Name of current database"
118+
- schemaname:
119+
usage: "LABEL"
120+
description: "Name of the schema that this table is in"
121+
- relname:
122+
usage: "LABEL"
123+
description: "Name of this table"
124+
- heap_blks_read:
125+
usage: "COUNTER"
126+
description: "Number of disk blocks read from this table"
127+
- heap_blks_hit:
128+
usage: "COUNTER"
129+
description: "Number of buffer hits in this table"
130+
- idx_blks_read:
131+
usage: "COUNTER"
132+
description: "Number of disk blocks read from all indexes on this table"
133+
- idx_blks_hit:
134+
usage: "COUNTER"
135+
description: "Number of buffer hits in all indexes on this table"
136+
- toast_blks_read:
137+
usage: "COUNTER"
138+
description: "Number of disk blocks read from this table's TOAST table (if any)"
139+
- toast_blks_hit:
140+
usage: "COUNTER"
141+
description: "Number of buffer hits in this table's TOAST table (if any)"
142+
- tidx_blks_read:
143+
usage: "COUNTER"
144+
description: "Number of disk blocks read from this table's TOAST table indexes (if any)"
145+
- tidx_blks_hit:
146+
usage: "COUNTER"
147+
description: "Number of buffer hits in this table's TOAST table indexes (if any)"
148+
149+
# WARNING: This set of metrics can be very expensive on a busy server as every unique query executed will create an additional time series
150+
# Per-statement execution statistics joined with role and database names.
# All *_time columns are divided by 1000 in the query, so the exported
# *_seconds metrics are in seconds; the descriptions below say so.
# NOTE(review): PostgreSQL 13 renamed total_time/min_time/max_time/mean_time/
# stddev_time to *_exec_time; confirm the server version this image targets.
pg_stat_statements:
  query: "SELECT t2.rolname, t3.datname, queryid, calls, total_time / 1000 as total_time_seconds, min_time / 1000 as min_time_seconds, max_time / 1000 as max_time_seconds, mean_time / 1000 as mean_time_seconds, stddev_time / 1000 as stddev_time_seconds, rows, shared_blks_hit, shared_blks_read, shared_blks_dirtied, shared_blks_written, local_blks_hit, local_blks_read, local_blks_dirtied, local_blks_written, temp_blks_read, temp_blks_written, blk_read_time / 1000 as blk_read_time_seconds, blk_write_time / 1000 as blk_write_time_seconds FROM pg_stat_statements t1 JOIN pg_roles t2 ON (t1.userid=t2.oid) JOIN pg_database t3 ON (t1.dbid=t3.oid) WHERE t2.rolname != 'rdsadmin'"
  master: true
  metrics:
    - rolname:
        usage: "LABEL"
        description: "Name of user"
    - datname:
        usage: "LABEL"
        description: "Name of database"
    - queryid:
        usage: "LABEL"
        description: "Query ID"
    - calls:
        usage: "COUNTER"
        description: "Number of times executed"
    - total_time_seconds:
        usage: "COUNTER"
        description: "Total time spent in the statement, in seconds"
    - min_time_seconds:
        usage: "GAUGE"
        description: "Minimum time spent in the statement, in seconds"
    - max_time_seconds:
        usage: "GAUGE"
        description: "Maximum time spent in the statement, in seconds"
    - mean_time_seconds:
        usage: "GAUGE"
        description: "Mean time spent in the statement, in seconds"
    - stddev_time_seconds:
        usage: "GAUGE"
        description: "Population standard deviation of time spent in the statement, in seconds"
    - rows:
        usage: "COUNTER"
        description: "Total number of rows retrieved or affected by the statement"
    - shared_blks_hit:
        usage: "COUNTER"
        description: "Total number of shared block cache hits by the statement"
    - shared_blks_read:
        usage: "COUNTER"
        description: "Total number of shared blocks read by the statement"
    - shared_blks_dirtied:
        usage: "COUNTER"
        description: "Total number of shared blocks dirtied by the statement"
    - shared_blks_written:
        usage: "COUNTER"
        description: "Total number of shared blocks written by the statement"
    - local_blks_hit:
        usage: "COUNTER"
        description: "Total number of local block cache hits by the statement"
    - local_blks_read:
        usage: "COUNTER"
        description: "Total number of local blocks read by the statement"
    - local_blks_dirtied:
        usage: "COUNTER"
        description: "Total number of local blocks dirtied by the statement"
    - local_blks_written:
        usage: "COUNTER"
        description: "Total number of local blocks written by the statement"
    - temp_blks_read:
        usage: "COUNTER"
        description: "Total number of temp blocks read by the statement"
    - temp_blks_written:
        usage: "COUNTER"
        description: "Total number of temp blocks written by the statement"
    - blk_read_time_seconds:
        usage: "COUNTER"
        description: "Total time the statement spent reading blocks, in seconds (if track_io_timing is enabled, otherwise zero)"
    - blk_write_time_seconds:
        usage: "COUNTER"
        description: "Total time the statement spent writing blocks, in seconds (if track_io_timing is enabled, otherwise zero)"
220+
221+
# Histogram of how long sessions have been idle, per application_name.
#   metrics : sum and count of idle seconds over sessions in state 'idle'
#   buckets : for each upper bound `le`, the number of idle sessions whose
#             time since state_change is <= le
# Both CTEs filter on state = 'idle'. Without that filter in `buckets`, the
# bucket values would also count non-idle sessions and could exceed
# seconds_count, which is computed from idle sessions only.
pg_process_idle:
  query: |
    WITH
      metrics AS (
        SELECT
          application_name,
          SUM(EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - state_change))::bigint)::float AS process_idle_seconds_sum,
          COUNT(*) AS process_idle_seconds_count
        FROM pg_stat_activity
        WHERE state = 'idle'
        GROUP BY application_name
      ),
      buckets AS (
        SELECT
          application_name,
          le,
          SUM(
            CASE WHEN EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - state_change)) <= le
              THEN 1
              ELSE 0
            END
          )::bigint AS bucket
        FROM
          pg_stat_activity,
          UNNEST(ARRAY[1, 2, 5, 15, 30, 60, 90, 120, 300]) AS le
        WHERE state = 'idle'
        GROUP BY application_name, le
        ORDER BY application_name, le
      )
    SELECT
      application_name,
      process_idle_seconds_sum as seconds_sum,
      process_idle_seconds_count as seconds_count,
      ARRAY_AGG(le) AS seconds,
      ARRAY_AGG(bucket) AS seconds_bucket
    FROM metrics JOIN buckets USING (application_name)
    GROUP BY 1, 2, 3
  metrics:
    - application_name:
        usage: "LABEL"
        description: "Application Name"
    - seconds:
        usage: "HISTOGRAM"
        description: "Idle time of server processes"

platforms/kubernetes/postgres-operator/deploy/postgres-operator.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,11 @@ spec:
188188
items:
189189
type: object
190190
x-kubernetes-preserve-unknown-fields: true
191+
maintenance:
192+
type: array
193+
items:
194+
type: object
195+
x-kubernetes-preserve-unknown-fields: true
191196
normal:
192197
type: array
193198
items:

platforms/kubernetes/postgres-operator/deploy/postgres-operator.yaml.template

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,11 @@ spec:
188188
items:
189189
type: object
190190
x-kubernetes-preserve-unknown-fields: true
191+
maintenance:
192+
type: array
193+
items:
194+
type: object
195+
x-kubernetes-preserve-unknown-fields: true
191196
normal:
192197
type: array
193198
items:

0 commit comments

Comments
 (0)