Skip to content

Commit ea6b865

Browse files
Add mxid and xid metrics
1 parent 6c16a8b commit ea6b865

File tree

6 files changed

+133
-33
lines changed

6 files changed

+133
-33
lines changed

etc/postgresql-metrics/default/postgresql-metrics.yml

+6-8
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,7 @@ ffwd:
3838
# Each entry must be a tuple with the function name, and a time interval in seconds
3939
# to call that metrics function.
4040
#
41-
# db_functions: Functions taking DB connection and returning a list of metrics,
42-
# called once per each database in cluster.
41+
# db_functions: Functions called once per each database in cluster.
4342
db_functions:
4443
- ["get_stats_disk_usage_for_database", 180]
4544
- ["get_stats_tx_rate_for_database", 60]
@@ -52,15 +51,14 @@ db_functions:
5251
# replication status relies on `pg_stat_wal_receiver`, which is only available on postgres 9.6+
5352
# - ["get_stats_incoming_replication_status", 30]
5453

55-
# global_db_functions: Functions taking DB connection and returning a list of metrics,
56-
# called once per the whole database cluster.
54+
# global_db_functions: Functions called once per the whole database cluster.
5755
global_db_functions:
5856
- ["get_stats_client_connections", 60]
5957
- ["get_stats_lock_statistics", 60]
6058
- ["get_stats_heap_hit_statistics", 60]
6159
- ["get_stats_replication_delays", 60]
62-
63-
# data_dir_functions: Functions taking a file path to Postgres data dir and returning
64-
# a list of metrics, called once per the whole database cluster.
65-
data_dir_functions:
6660
- ["get_stats_wal_file_amount", 180]
61+
- ["get_multixact_members_usage_ratio", 60]
62+
- ["get_multixact_members_per_mxid", 60]
63+
- ["get_multixact_remaining_ratio", 60]
64+
- ["get_xid_remaining_ratio", 60]

postgresql_metrics/default_metrics.py

+23
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,29 @@ def metric_sec_since_oldest_xact_start(database_name, value):
116116
'unit': 's'})
117117

118118

119+
def metric_xid_remaining_ratio(value):
    """Wrap *value* as the 'xid-remaining' default metric, reported in '%'."""
    attributes = {'what': 'xid-remaining', 'unit': '%'}
    return create_default_metric(value, attributes)
123+
124+
125+
def metric_multixact_remaining_ratio(value):
    """Wrap *value* as the 'mxid-remaining' default metric, reported in '%'."""
    attributes = {'what': 'mxid-remaining', 'unit': '%'}
    return create_default_metric(value, attributes)
129+
130+
131+
def metric_multixact_members_per_mxid(value):
    """Wrap *value* as the 'multixact-members-per-mxid' default metric ('members/id')."""
    attributes = {'what': 'multixact-members-per-mxid', 'unit': 'members/id'}
    return create_default_metric(value, attributes)
135+
136+
137+
def metric_multixact_members_remaining_ratio(value):
    """Wrap *value* as the 'multixact-members-remaining' default metric, reported in '%'."""
    attributes = {'what': 'multixact-members-remaining', 'unit': '%'}
    return create_default_metric(value, attributes)
141+
119142
def metric_wal_file_amount(value):
120143
return create_default_metric(value,
121144
{'what': 'wal-file-amount',

postgresql_metrics/localhost_postgres_stats.py

+12
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,18 @@
2424
LOG = get_logger()
2525

2626

27+
def get_multixact_member_files(data_dir):
    """Count the regular files in ``<data_dir>/pg_multixact/members``.

    Sub-directories and other non-file entries are not counted.

    Returns 0, after logging the problem, when the members directory does not
    exist or cannot be read.
    """
    members_dir = os.path.join(data_dir, "pg_multixact", "members")
    try:
        if os.path.isdir(members_dir):
            return sum(1 for name in os.listdir(members_dir)
                       if os.path.isfile(os.path.join(members_dir, name)))
        # No exception is in flight on this path, so log a plain error
        # instead of asking the logger for a (non-existent) traceback.
        LOG.error(f"Missing pg_multixact/members directory in data_dir: {data_dir}")
    except OSError:
        # The f-prefix is required, otherwise "{data_dir}" is logged literally.
        LOG.exception(f'Failed accessing multixact member files in: {data_dir}. Is data dir readable by user?')
    return 0
37+
38+
2739
def get_amount_of_wal_files(data_dir):
2840
amount_of_wal_files = 0
2941
try:

postgresql_metrics/metrics_gatherer.py

+61-13
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,13 @@
4343
metric_replication_delay_bytes,
4444
metric_wal_file_amount,
4545
metric_incoming_replication_running,
46+
metric_multixact_members_per_mxid,
47+
metric_multixact_remaining_ratio,
48+
metric_xid_remaining_ratio,
49+
metric_multixact_members_remaining_ratio,
4650
)
4751

48-
from postgresql_metrics.localhost_postgres_stats import get_amount_of_wal_files
52+
from postgresql_metrics.localhost_postgres_stats import get_amount_of_wal_files, get_multixact_member_files
4953

5054
from postgresql_metrics.postgres_queries import (
5155
get_client_connections_amount,
@@ -60,23 +64,29 @@
6064
get_replication_delays,
6165
get_tables_with_oids_for_current_db,
6266
get_wal_receiver_status,
67+
get_max_mxid_age,
68+
get_max_xid_age,
6369
)
6470

71+
# Number of multixact members accounted for by each file found in
# pg_multixact/members.
# NOTE(review): presumably 1636 members per 8 kB page * 32 pages per segment
# file on a default PostgreSQL build -- confirm against the server source.
MEMBERS_PER_MEMBER_FILE = 52352
# Size of the multixact member space that usage is measured against.
MAX_MULTIXACT_MEMBERS = 2**32
# Largest xid/mxid age used as the 0%-remaining mark (half of the 32-bit id
# space, minus one). Kept as an int; the ratios derived from it are floats.
WRAPAROUND_LIMIT = 2**31 - 1
6574

6675
# Notice that all functions here are expected to return a list of metrics.
6776
# Notice also that the names of these functions should match the configuration.
6877

69-
def get_stats_client_connections(db_connection):
78+
79+
def get_stats_client_connections(_data_dir, db_connection):
    """Return a one-element list holding the client-connections metric.

    ``_data_dir`` is accepted for the uniform stats-function signature and is
    not used.
    """
    return [metric_client_connections(get_client_connections_amount(db_connection))]
7282

7383

74-
def get_stats_disk_usage_for_database(db_connection):
84+
def get_stats_disk_usage_for_database(_data_dir, db_connection):
    """Return a one-element list holding the database-size metric.

    The first two items of the ``get_disk_usage_for_database`` result are
    forwarded, in order, to ``metric_database_size``. ``_data_dir`` is unused.
    """
    size_info = get_disk_usage_for_database(db_connection)
    size_metric = metric_database_size(size_info[0], size_info[1])
    return [size_metric]
7787

7888

79-
def get_stats_tx_rate_for_database(db_connection):
89+
def get_stats_tx_rate_for_database(_data_dir, db_connection):
8090
db_name, tx_rate, tx_rollbacks = get_transaction_rate_for_database(db_connection)
8191
if tx_rate is not None:
8292
return [metric_transaction_rate(db_name, tx_rate),
@@ -85,15 +95,15 @@ def get_stats_tx_rate_for_database(db_connection):
8595
return []
8696

8797

88-
def get_stats_seconds_since_last_vacuum_per_table(db_connection):
98+
def get_stats_seconds_since_last_vacuum_per_table(_data_dir, db_connection):
    """Return one seconds-since-last-vacuum metric per row reported for the database.

    Each row is a (db_name, table_name, seconds_since) triple. ``_data_dir``
    is unused.
    """
    return [
        metric_seconds_since_last_vacuum(db_name, table_name, seconds_since)
        for db_name, table_name, seconds_since
        in get_seconds_since_last_vacuum_per_table(db_connection)
    ]
94104

95105

96-
def get_stats_heap_hit_statistics(db_connection):
106+
def get_stats_heap_hit_statistics(_data_dir, db_connection):
97107
db_name, heap_read, heap_hit, heap_hit_ratio = get_heap_hit_statistics(db_connection)
98108
metrics = []
99109
if heap_hit_ratio is not None:
@@ -103,7 +113,7 @@ def get_stats_heap_hit_statistics(db_connection):
103113
return metrics
104114

105115

106-
def get_stats_lock_statistics(db_connection):
116+
def get_stats_lock_statistics(_data_dir, db_connection):
107117
locks_by_type, [total_locks_waiting, total_locks_granted] = get_lock_statistics(db_connection)
108118
metrics = []
109119
for lock_type, [locks_waiting, locks_granted] in locks_by_type.items():
@@ -114,15 +124,15 @@ def get_stats_lock_statistics(db_connection):
114124
return metrics
115125

116126

117-
def get_stats_oldest_transaction_timestamp(db_connection):
127+
def get_stats_oldest_transaction_timestamp(_data_dir, db_connection):
    """Return the seconds-since-oldest-transaction-start metric, if a value is available.

    Yields an empty list when the query reports no value (None).
    ``_data_dir`` is unused.
    """
    db_name, seconds_since_start = get_oldest_transaction_timestamp(db_connection)
    if seconds_since_start is None:
        return []
    return [metric_sec_since_oldest_xact_start(db_name, seconds_since_start)]
123133

124134

125-
def get_stats_table_bloat(db_connection):
135+
def get_stats_table_bloat(_data_dir, db_connection):
126136
tables_with_oids = get_tables_with_oids_for_current_db(db_connection)
127137
metrics = []
128138
for table_oid, table_name in tables_with_oids:
@@ -132,7 +142,7 @@ def get_stats_table_bloat(db_connection):
132142
return metrics
133143

134144

135-
def get_stats_index_hit_rates(db_connection):
145+
def get_stats_index_hit_rates(_data_dir, db_connection):
136146
index_hit_rates = get_index_hit_rates(db_connection)
137147
metrics = []
138148
for db_name, table_name, index_hit_ratio in index_hit_rates:
@@ -141,18 +151,56 @@ def get_stats_index_hit_rates(db_connection):
141151
return metrics
142152

143153

144-
def get_stats_replication_delays(db_connection):
154+
def get_stats_replication_delays(_data_dir, db_connection):
    """Return one replication-delay metric per (client_addr, delay_in_bytes) row.

    ``_data_dir`` is unused.
    """
    return [
        metric_replication_delay_bytes(client_addr, delay_in_bytes)
        for client_addr, delay_in_bytes in get_replication_delays(db_connection)
    ]
150160

151161

152-
def get_stats_wal_file_amount(data_dir):
162+
def _get_multixact_members(data_dir):
    """Estimate the multixact member count as member files times members per file."""
    member_file_count = get_multixact_member_files(data_dir)
    return member_file_count * MEMBERS_PER_MEMBER_FILE
164+
165+
166+
def get_multixact_members_per_mxid(data_dir, db_connection):
    """Return the multixact-members-per-mxid metric, rounded to two decimals.

    Yields an empty list when the maximum mxid age is unavailable or zero,
    which also avoids a division by zero.
    """
    member_count = _get_multixact_members(data_dir)
    oldest_mxid_age = get_max_mxid_age(db_connection)
    if oldest_mxid_age:
        return [metric_multixact_members_per_mxid(round(member_count / oldest_mxid_age, 2))]
    return []
173+
174+
175+
def get_multixact_members_usage_ratio(data_dir, _db_connection):
    """Return the percentage of the multixact member space still unused.

    The member count is estimated from the member files under ``data_dir`` and
    compared against ``MAX_MULTIXACT_MEMBERS``. ``_db_connection`` is accepted
    for the uniform stats-function signature and is not used.

    Returns a one-element list with the multixact-members-remaining metric.
    """
    members = _get_multixact_members(data_dir)
    # True division: floor division would truncate the ratio to 0 and make
    # the metric report 100% remaining no matter how many members exist.
    ratio = members / MAX_MULTIXACT_MEMBERS
    percentage_remaining = (1.0 - ratio) * 100
    return [metric_multixact_members_remaining_ratio(percentage_remaining)]
180+
181+
182+
def get_multixact_remaining_ratio(_data_dir, db_connection):
    """Return the percentage of multixact ids left before ``WRAPAROUND_LIMIT``.

    ``_data_dir`` is accepted for the uniform stats-function signature and is
    not used.

    Returns a one-element list with the mxid-remaining metric, or an empty
    list when the maximum mxid age could not be determined.
    """
    mxid_age = get_max_mxid_age(db_connection)
    # Only a missing value means "no data"; an age of 0 is a valid reading
    # and is reported as 100% remaining.
    if mxid_age is None:
        return []
    # True division: floor division would truncate the ratio to 0 for every
    # age below the limit, pinning the metric at 100%.
    ratio = mxid_age / WRAPAROUND_LIMIT
    percentage_remaining = (1.0 - ratio) * 100
    return [metric_multixact_remaining_ratio(percentage_remaining)]
189+
190+
191+
def get_xid_remaining_ratio(_data_dir, db_connection):
    """Return the percentage of transaction ids left before ``WRAPAROUND_LIMIT``.

    ``_data_dir`` is accepted for the uniform stats-function signature and is
    not used.

    Returns a one-element list with the xid-remaining metric, or an empty
    list when the maximum xid age could not be determined.
    """
    xid_age = get_max_xid_age(db_connection)
    # Only a missing value means "no data"; an age of 0 is a valid reading
    # and is reported as 100% remaining.
    if xid_age is None:
        return []
    # True division: floor division would truncate the ratio to 0 for every
    # age below the limit, pinning the metric at 100%.
    ratio = xid_age / WRAPAROUND_LIMIT
    percentage_remaining = (1.0 - ratio) * 100
    return [metric_xid_remaining_ratio(percentage_remaining)]
198+
199+
200+
def get_stats_wal_file_amount(data_dir, _db_connection):
    """Return a one-element list holding the WAL-file-amount metric for ``data_dir``.

    ``_db_connection`` is accepted for the uniform stats-function signature
    and is not used.
    """
    wal_file_count = get_amount_of_wal_files(data_dir)
    return [metric_wal_file_amount(wal_file_count)]
154202

155203

156-
def get_stats_incoming_replication_status(db_connection):
204+
def get_stats_incoming_replication_status(_data_dir, db_connection):
    """Return one incoming-replication-running metric per (host, is_streaming) row.

    ``_data_dir`` is unused.
    """
    metrics = []
    for host, is_streaming in get_wal_receiver_status(db_connection):
        metrics.append(metric_incoming_replication_running(host, is_streaming))
    return metrics

postgresql_metrics/metrics_logic.py

+9-12
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def _is_time_to_call_stats_func_and_update_ts(database_name, metrics_func, run_i
8282
return False
8383

8484

85-
def _call_all_db_functions(db_parameter, db_stats_functions, schedule=False, db_name=None):
85+
def _call_all_db_functions(db_stats_functions, db_parameters, schedule=False, db_name=None):
8686
"""Iterates through all given statistics functions, calling them with the given parameter.
8787
The db_parameter can be a database connection or a file path to Postgres data directory,
8888
depending on the statistics function to call.
@@ -100,7 +100,7 @@ def _call_all_db_functions(db_parameter, db_stats_functions, schedule=False, db_
100100
if is_call_required:
101101
try:
102102
LOG.debug('calling stats function {}', db_metrics_func.__name__)
103-
metrics.extend(db_metrics_func(db_parameter))
103+
metrics.extend(db_metrics_func(*db_parameters))
104104
except Exception:
105105
LOG.exception('failed calling stats function: ' + db_metrics_func.__name__)
106106
return metrics
@@ -123,22 +123,20 @@ def get_stats_functions_from_conf(func_key_name, conf):
123123
def get_all_stats_functions_from_conf(conf):
    """Load the configured stats functions.

    Returns a ``(db_functions, global_db_functions)`` tuple, read from the
    configuration keys of the same names.
    """
    return (
        get_stats_functions_from_conf('db_functions', conf),
        get_stats_functions_from_conf('global_db_functions', conf),
    )
128127

129128

130129
def get_all_metrics_now(db_connections, conf):
    """Collect every configured metric immediately, without scheduling.

    Cluster-wide functions run once against the first connection; per-database
    functions then run once for every connection. Each function receives the
    Postgres data directory and a database connection.
    """
    per_db_functions, cluster_functions = get_all_stats_functions_from_conf(conf)
    first_connection = db_connections[0]
    data_dir = figure_out_postgres_data_dir(first_connection, conf)

    collected = _call_all_db_functions(cluster_functions, (data_dir, first_connection))
    for connection in db_connections:
        collected.extend(_call_all_db_functions(per_db_functions, (data_dir, connection)))
    return collected
143141

144142

@@ -147,14 +145,13 @@ def get_all_metrics_scheduled(db_connections, conf):
147145
First gets the global stats with first available database connection,
148146
and then gets the rest per database.
149147
"""
150-
db_functions, global_db_functions, data_dir_functions = get_all_stats_functions_from_conf(conf)
148+
db_functions, global_db_functions = get_all_stats_functions_from_conf(conf)
151149
data_dir = figure_out_postgres_data_dir(db_connections[0], conf)
152150

153-
all_metrics = _call_all_db_functions(db_connections[0], global_db_functions, schedule=True)
154-
all_metrics.extend(_call_all_db_functions(data_dir, data_dir_functions, schedule=True))
151+
all_metrics = _call_all_db_functions(global_db_functions, (data_dir, db_connections[0]), schedule=True)
155152
for db_connection in db_connections:
156153
db_name = get_db_name_from_connection(db_connection)
157-
all_metrics.extend(_call_all_db_functions(db_connection, db_functions,
154+
all_metrics.extend(_call_all_db_functions(db_functions, (data_dir, db_connection),
158155
schedule=True, db_name=db_name))
159156
return all_metrics
160157

postgresql_metrics/postgres_queries.py

+22
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,28 @@ def get_oldest_transaction_timestamp(conn):
195195
return None, None
196196

197197

198+
def get_max_mxid_age(conn):
    """Return the largest ``mxid_age(relminmxid)`` over ``pg_class``.

    Returns None when the server is older than 9.5 (``mxid_age`` is not
    available there), when the query yields no rows, or when the aggregate is
    NULL because no relation matched.
    """
    # `mxid_age` is only available on postgres 9.5 and newer.
    # NOTE(review): assumes `server_version` uses the libpq/psycopg2 encoding,
    # where 9.5.0 is 90500 and 10.0 is 100000 -- the previous threshold of
    # 95000 rejected 9.5 and 9.6 servers.
    if conn.server_version < 90500:
        LOG.error("Unable to check mxid_age on versions of postgres below 9.5")
        return None
    sql = "SELECT max(mxid_age(relminmxid)) FROM pg_class WHERE relminmxid <> '0'"
    results = query(conn, sql)
    if not results:
        return None
    mxid_age, = results[0]
    # max() over zero matching rows is NULL; int(None) would raise TypeError.
    if mxid_age is None:
        return None
    return int(mxid_age)
209+
210+
211+
def get_max_xid_age(conn):
    """Return the largest ``age(datfrozenxid)`` over ``pg_database``.

    Returns None when the query yields no rows or the aggregate is NULL.
    """
    sql = "SELECT max(age(datfrozenxid)) FROM pg_database"
    results = query(conn, sql)
    if not results:
        return None
    xid_age, = results[0]
    # Guard against a NULL aggregate; int(None) would raise TypeError.
    if xid_age is None:
        return None
    return int(xid_age)
218+
219+
198220
def get_replication_delays(conn):
199221
sql = ("SELECT client_addr, "
200222
"pg_xlog_location_diff(pg_current_xlog_location(), replay_location) AS bytes_diff "

0 commit comments

Comments
 (0)