Skip to content

Commit 3d3ed34

Browse files
committed
Adding StatsD logging to Swift.
Documentation, including a list of metrics reported and their semantics, is in the Admin Guide in a new section, "Reporting Metrics to StatsD". An optional "metric prefix" may be configured which will be prepended to every metric name sent to StatsD. Here is the rationale for doing a deep integration like this versus only sending metrics to StatsD in middleware: it's the only way to report some internal activities of Swift in a real-time manner. So to have one way of reporting to StatsD and one place/style of configuration, even some things (like, say, timing of PUT requests into the proxy-server) which could be logged via middleware are consistently logged the same way (deep integration via the logger delegate methods). When log_statsd_host is configured, get_logger() injects a swift.common.utils.StatsdClient object into the logger as logger.statsd_client. Then a set of delegate methods on LogAdapter either pass through to the StatsdClient object or become no-ops. This allows StatsD logging to look like: self.logger.increment('some.metric.here') and do the right thing in all cases and with no messy conditional logic. I wanted to use the pystatsd module for the StatsD client, but the version on PyPI is lagging the git repo (and is missing both the prefix functionality and the timing_since() method). So I wrote my own, swift.common.utils.StatsdClient. The interface is the same as pystatsd.Client, but the code was written from scratch. It's pretty simple, and the tests I added cover it. This also frees Swift from an optional dependency on the pystatsd module, making this feature easier to enable. There's test coverage for the new code and all existing tests continue to pass. Refactored out a _one_audit_pass() method in swift/account/auditor.py and swift/container/auditor.py. Fixed some misc. PEP8 violations. Misc test cleanups and refactorings (particularly the way "fake logging" is handled). Change-Id: Ie968a9ae8771f59ee7591e2ae11999c44bfe33b2
1 parent 86f37c4 commit 3d3ed34

32 files changed

+1412
-253
lines changed

doc/source/admin_guide.rst

+436
Large diffs are not rendered by default.

doc/source/conf.py

+23-16
Original file line numberDiff line numberDiff line change
@@ -18,36 +18,42 @@
1818
# Swift documentation build configuration file, created by
1919
# sphinx-quickstart on Tue May 18 13:50:15 2010.
2020
#
21-
# This file is execfile()d with the current directory set to its containing dir.
21+
# This file is execfile()d with the current directory set to its containing
22+
# dir.
2223
#
2324
# Note that not all possible configuration values are present in this
2425
# autogenerated file.
2526
#
2627
# All configuration values have a default; values that are commented out
2728
# serve to show the default.
2829

29-
import sys, os
30+
import sys
31+
import os
3032

3133
# If extensions (or modules to document with autodoc) are in another directory,
3234
# add these directories to sys.path here. If the directory is relative to the
3335
# documentation root, use os.path.abspath to make it absolute, like shown here.
34-
sys.path.append([os.path.abspath('../swift'), os.path.abspath('..'), os.path.abspath('../bin')])
36+
sys.path.append([os.path.abspath('../swift'), os.path.abspath('..'),
37+
os.path.abspath('../bin')])
3538

36-
# -- General configuration -----------------------------------------------------
39+
# -- General configuration ----------------------------------------------------
3740

38-
# Add any Sphinx extension module names here, as strings. They can be extensions
39-
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
40-
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.pngmath', 'sphinx.ext.ifconfig']
41+
# Add any Sphinx extension module names here, as strings. They can be
42+
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
43+
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx',
44+
'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.pngmath',
45+
'sphinx.ext.ifconfig']
4146
todo_include_todos = True
4247

4348
# Add any paths that contain templates here, relative to this directory.
44-
# Changing the path so that the Hudson build output contains GA code and the source
45-
# docs do not contain the code so local, offline sphinx builds are "clean."
49+
# Changing the path so that the Hudson build output contains GA code and the
50+
# source docs do not contain the code so local, offline sphinx builds are
51+
# "clean."
4652
templates_path = []
4753
if os.getenv('HUDSON_PUBLISH_DOCS'):
48-
templates_path = ['_ga', '_templates']
54+
templates_path = ['_ga', '_templates']
4955
else:
50-
templates_path = ['_templates']
56+
templates_path = ['_templates']
5157

5258
# The suffix of source filenames.
5359
source_suffix = '.rst'
@@ -89,7 +95,8 @@
8995
# for source files.
9096
exclude_trees = []
9197

92-
# The reST default role (used for this markup: `text`) to use for all documents.
98+
# The reST default role (used for this markup: `text`) to use for all
99+
# documents.
93100
#default_role = None
94101

95102
# If true, '()' will be appended to :func: etc. cross-reference text.
@@ -110,7 +117,7 @@
110117
modindex_common_prefix = ['swift.']
111118

112119

113-
# -- Options for HTML output ---------------------------------------------------
120+
# -- Options for HTML output -----------------------------------------------
114121

115122
# The theme to use for HTML and HTML Help pages. Major themes that come with
116123
# Sphinx are currently 'default' and 'sphinxdoc'.
@@ -188,7 +195,7 @@
188195
htmlhelp_basename = 'swiftdoc'
189196

190197

191-
# -- Options for LaTeX output --------------------------------------------------
198+
# -- Options for LaTeX output -------------------------------------------------
192199

193200
# The paper size ('letter' or 'a4').
194201
#latex_paper_size = 'letter'
@@ -197,7 +204,8 @@
197204
#latex_font_size = '10pt'
198205

199206
# Grouping the document tree into LaTeX files. List of tuples
200-
# (source start file, target name, title, author, documentclass [howto/manual]).
207+
# (source start file, target name, title, author, documentclass
208+
# [howto/manual]).
201209
latex_documents = [
202210
('index', 'Swift.tex', u'Swift Documentation',
203211
u'Swift Team', 'manual'),
@@ -224,4 +232,3 @@
224232
intersphinx_mapping = {'python': ('http://docs.python.org/', None),
225233
'nova': ('http://nova.openstack.org', None),
226234
'glance': ('http://glance.openstack.org', None)}
227-

etc/account-server.conf-sample

+6
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,12 @@
1111
# log_name = swift
1212
# log_facility = LOG_LOCAL0
1313
# log_level = INFO
14+
# You can enable default StatsD logging here and/or override it in sections
15+
# below:
16+
# log_statsd_host = localhost
17+
# log_statsd_port = 8125
18+
# log_statsd_default_sample_rate = 1
19+
# log_statsd_metric_prefix =
1420
# Normally Swift will try to preallocate disk space for new SQLite databases to
1521
# decrease fragmentation (at the cost of disk usage). You may turn this feature
1622
# off here.

etc/container-server.conf-sample

+6
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,12 @@
1414
# log_name = swift
1515
# log_facility = LOG_LOCAL0
1616
# log_level = INFO
17+
# You can enable default StatsD logging here and/or override it in sections
18+
# below:
19+
# log_statsd_host = localhost
20+
# log_statsd_port = 8125
21+
# log_statsd_default_sample_rate = 1
22+
# log_statsd_metric_prefix =
1723
# Normally Swift will try to preallocate disk space for new SQLite databases to
1824
# decrease fragmentation (at the cost of disk usage). You may turn this feature
1925
# off here.

etc/object-expirer.conf-sample

+5
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@
55
# log_name = swift
66
# log_facility = LOG_LOCAL0
77
# log_level = INFO
8+
# You can enable default StatsD logging here if you want:
9+
# log_statsd_host = localhost
10+
# log_statsd_port = 8125
11+
# log_statsd_default_sample_rate = 1
12+
# log_statsd_metric_prefix =
813

914
[object-expirer]
1015
# interval = 300

etc/object-server.conf-sample

+6
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,12 @@
1212
# log_name = swift
1313
# log_facility = LOG_LOCAL0
1414
# log_level = INFO
15+
# You can enable default StatsD logging here and/or override it in sections
16+
# below:
17+
# log_statsd_host = localhost
18+
# log_statsd_port = 8125
19+
# log_statsd_default_sample_rate = 1
20+
# log_statsd_metric_prefix =
1521

1622
[pipeline:main]
1723
pipeline = recon object-server

etc/proxy-server.conf-sample

+6
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,12 @@
1313
# log_name = swift
1414
# log_facility = LOG_LOCAL0
1515
# log_level = INFO
16+
# You can enable default StatsD logging here and/or override it in sections
17+
# below:
18+
# log_statsd_host = localhost
19+
# log_statsd_port = 8125
20+
# log_statsd_default_sample_rate = 1
21+
# log_statsd_metric_prefix =
1622

1723
[pipeline:main]
1824
pipeline = catch_errors healthcheck cache tempauth proxy-server

swift/account/auditor.py

+25-30
Original file line numberDiff line numberDiff line change
@@ -41,29 +41,34 @@ def __init__(self, conf):
4141
swift.common.db.DB_PREALLOCATION = \
4242
conf.get('db_preallocation', 't').lower() in TRUE_VALUES
4343

44+
def _one_audit_pass(self, reported):
45+
all_locs = audit_location_generator(self.devices,
46+
account_server.DATADIR, mount_check=self.mount_check,
47+
logger=self.logger)
48+
for path, device, partition in all_locs:
49+
self.account_audit(path)
50+
if time.time() - reported >= 3600: # once an hour
51+
self.logger.info(_('Since %(time)s: Account audits: '
52+
'%(passed)s passed audit, %(failed)s failed audit'),
53+
{'time': time.ctime(reported),
54+
'passed': self.account_passes,
55+
'failed': self.account_failures})
56+
reported = time.time()
57+
self.account_passes = 0
58+
self.account_failures = 0
59+
return reported
60+
4461
def run_forever(self, *args, **kwargs):
4562
"""Run the account audit until stopped."""
4663
reported = time.time()
4764
time.sleep(random() * self.interval)
4865
while True:
49-
self.logger.info(_('Begin account audit pass'))
66+
self.logger.info(_('Begin account audit pass.'))
5067
begin = time.time()
5168
try:
52-
all_locs = audit_location_generator(self.devices,
53-
account_server.DATADIR, mount_check=self.mount_check,
54-
logger=self.logger)
55-
for path, device, partition in all_locs:
56-
self.account_audit(path)
57-
if time.time() - reported >= 3600: # once an hour
58-
self.logger.info(_('Since %(time)s: Account audits: '
59-
'%(passed)s passed audit, %(failed)s failed audit'),
60-
{'time': time.ctime(reported),
61-
'passed': self.account_passes,
62-
'failed': self.account_failures})
63-
reported = time.time()
64-
self.account_passes = 0
65-
self.account_failures = 0
69+
reported = self._one_audit_pass(reported)
6670
except (Exception, Timeout):
71+
self.logger.increment('errors')
6772
self.logger.exception(_('ERROR auditing'))
6873
elapsed = time.time() - begin
6974
if elapsed < self.interval:
@@ -75,21 +80,7 @@ def run_once(self, *args, **kwargs):
7580
"""Run the account audit once."""
7681
self.logger.info(_('Begin account audit "once" mode'))
7782
begin = reported = time.time()
78-
all_locs = audit_location_generator(self.devices,
79-
account_server.DATADIR,
80-
mount_check=self.mount_check,
81-
logger=self.logger)
82-
for path, device, partition in all_locs:
83-
self.account_audit(path)
84-
if time.time() - reported >= 3600: # once an hour
85-
self.logger.info(_('Since %(time)s: Account audits: '
86-
'%(passed)s passed audit, %(failed)s failed audit'),
87-
{'time': time.ctime(reported),
88-
'passed': self.account_passes,
89-
'failed': self.account_failures})
90-
reported = time.time()
91-
self.account_passes = 0
92-
self.account_failures = 0
83+
self._one_audit_pass(reported)
9384
elapsed = time.time() - begin
9485
self.logger.info(
9586
_('Account audit "once" mode completed: %.02fs'), elapsed)
@@ -100,15 +91,19 @@ def account_audit(self, path):
10091
10192
:param path: the path to an account db
10293
"""
94+
start_time = time.time()
10395
try:
10496
if not path.endswith('.db'):
10597
return
10698
broker = AccountBroker(path)
10799
if not broker.is_deleted():
108100
info = broker.get_info()
101+
self.logger.increment('passes')
109102
self.account_passes += 1
110103
self.logger.debug(_('Audit passed for %s') % broker.db_file)
111104
except (Exception, Timeout):
105+
self.logger.increment('failures')
112106
self.account_failures += 1
113107
self.logger.exception(_('ERROR Could not get account info %s'),
114108
(broker.db_file))
109+
self.logger.timing_since('timing', start_time)

swift/account/reaper.py

+20
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ def run_once(self, *args, **kwargs):
119119
for device in os.listdir(self.devices):
120120
if self.mount_check and \
121121
not os.path.ismount(os.path.join(self.devices, device)):
122+
self.logger.increment('errors')
122123
self.logger.debug(
123124
_('Skipping %s as it is not mounted'), device)
124125
continue
@@ -162,6 +163,7 @@ def reap_device(self, device):
162163
if fname.endswith('.ts'):
163164
break
164165
elif fname.endswith('.db'):
166+
self.start_time = time()
165167
broker = \
166168
AccountBroker(os.path.join(hsh_path, fname))
167169
if broker.is_status_deleted() and \
@@ -262,6 +264,7 @@ def reap_account(self, broker, partition, nodes):
262264
log = log[:-2]
263265
log += _(', elapsed: %.02fs') % (time() - begin)
264266
self.logger.info(log)
267+
self.logger.timing_since('timing', self.start_time)
265268
return True
266269

267270
def reap_container(self, account, account_partition, account_nodes,
@@ -313,12 +316,15 @@ def reap_container(self, account, account_partition, account_nodes,
313316
response_timeout=self.node_timeout)[1]
314317
self.stats_return_codes[2] = \
315318
self.stats_return_codes.get(2, 0) + 1
319+
self.logger.increment('return_codes.2')
316320
except ClientException, err:
317321
if self.logger.getEffectiveLevel() <= DEBUG:
318322
self.logger.exception(
319323
_('Exception with %(ip)s:%(port)s/%(device)s'), node)
320324
self.stats_return_codes[err.http_status / 100] = \
321325
self.stats_return_codes.get(err.http_status / 100, 0) + 1
326+
self.logger.increment(
327+
'return_codes.%d' % (err.http_status / 100,))
322328
if not objects:
323329
break
324330
try:
@@ -348,19 +354,26 @@ def reap_container(self, account, account_partition, account_nodes,
348354
successes += 1
349355
self.stats_return_codes[2] = \
350356
self.stats_return_codes.get(2, 0) + 1
357+
self.logger.increment('return_codes.2')
351358
except ClientException, err:
352359
if self.logger.getEffectiveLevel() <= DEBUG:
353360
self.logger.exception(
354361
_('Exception with %(ip)s:%(port)s/%(device)s'), node)
355362
failures += 1
363+
self.logger.increment('containers_failures')
356364
self.stats_return_codes[err.http_status / 100] = \
357365
self.stats_return_codes.get(err.http_status / 100, 0) + 1
366+
self.logger.increment(
367+
'return_codes.%d' % (err.http_status / 100,))
358368
if successes > failures:
359369
self.stats_containers_deleted += 1
370+
self.logger.increment('containers_deleted')
360371
elif not successes:
361372
self.stats_containers_remaining += 1
373+
self.logger.increment('containers_remaining')
362374
else:
363375
self.stats_containers_possibly_remaining += 1
376+
self.logger.increment('containers_possibly_remaining')
364377

365378
def reap_object(self, account, container, container_partition,
366379
container_nodes, obj):
@@ -399,16 +412,23 @@ def reap_object(self, account, container, container_partition,
399412
successes += 1
400413
self.stats_return_codes[2] = \
401414
self.stats_return_codes.get(2, 0) + 1
415+
self.logger.increment('return_codes.2')
402416
except ClientException, err:
403417
if self.logger.getEffectiveLevel() <= DEBUG:
404418
self.logger.exception(
405419
_('Exception with %(ip)s:%(port)s/%(device)s'), node)
406420
failures += 1
421+
self.logger.increment('objects_failures')
407422
self.stats_return_codes[err.http_status / 100] = \
408423
self.stats_return_codes.get(err.http_status / 100, 0) + 1
424+
self.logger.increment(
425+
'return_codes.%d' % (err.http_status / 100,))
409426
if successes > failures:
410427
self.stats_objects_deleted += 1
428+
self.logger.increment('objects_deleted')
411429
elif not successes:
412430
self.stats_objects_remaining += 1
431+
self.logger.increment('objects_remaining')
413432
else:
414433
self.stats_objects_possibly_remaining += 1
434+
self.logger.increment('objects_possibly_remaining')

0 commit comments

Comments
 (0)