Commit 9b299b9

redbo authored and Tarmac committed
Make obj/replicator timeouts configurable
2 parents f99577e + e5b48be commit 9b299b9

File tree: 3 files changed, +48 -28 lines


etc/object-server.conf-sample

+9 -2

@@ -32,8 +32,15 @@ use = egg:swift#object
 # daemonize = on
 # run_pause = 30
 # concurrency = 1
-# timeout = 300
-# stats_interval = 3600
+# stats_interval = 300
+# max duration of a partition rsync
+# rsync_timeout = 600
+# passed to rsync for io op timeout
+# rsync_io_timeout = 10
+# max duration of an http request
+# http_timeout = 60
+# attempts to kill all workers if nothing replicates for lockup_timeout seconds
+# lockup_timeout = 900
 # The replicator also performs reclamation
 # reclaim_age = 604800
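
The four new knobs bound different layers of a replication pass: rsync_io_timeout is handed straight to rsync as --timeout/--contimeout, rsync_timeout caps one whole partition rsync, http_timeout caps each REPLICATE request, and lockup_timeout is the last-resort cap on the pass as a whole. As a rough sketch of how such string options end up typed in the daemon (the conf dict and values here are hypothetical; the actual wiring is the __init__ hunk in swift/obj/replicator.py below):

# Hypothetical paste-deploy style conf: all values arrive as strings.
conf = {
    'rsync_timeout': '600',      # max duration of a partition rsync
    'rsync_io_timeout': '10',    # passed to rsync for io op timeout
    'http_timeout': '60',        # max duration of an http request
    'lockup_timeout': '900',     # kill workers if nothing replicates this long
}

rsync_timeout = int(conf.get('rsync_timeout', 300))
rsync_io_timeout = conf.get('rsync_io_timeout', '10')  # stays a string for '--timeout=%s'
http_timeout = int(conf.get('http_timeout', 60))
lockup_timeout = int(conf.get('lockup_timeout', 900))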

swift/obj/replicator.py

+36 -22

@@ -216,14 +216,17 @@ def __init__(self, conf):
         self.swift_dir = conf.get('swift_dir', '/etc/swift')
         self.port = int(conf.get('bind_port', 6000))
         self.concurrency = int(conf.get('concurrency', 1))
-        self.timeout = conf.get('timeout', '5')
-        self.stats_interval = int(conf.get('stats_interval', '3600'))
+        self.stats_interval = int(conf.get('stats_interval', '300'))
         self.object_ring = Ring(join(self.swift_dir, 'object.ring.gz'))
         self.ring_check_interval = int(conf.get('ring_check_interval', 15))
         self.next_check = time.time() + self.ring_check_interval
         self.reclaim_age = int(conf.get('reclaim_age', 86400 * 7))
         self.partition_times = []
         self.run_pause = int(conf.get('run_pause', 30))
+        self.rsync_timeout = int(conf.get('rsync_timeout', 300))
+        self.rsync_io_timeout = conf.get('rsync_io_timeout', '10')
+        self.http_timeout = int(conf.get('http_timeout', 60))
+        self.lockup_timeout = int(conf.get('lockup_timeout', 900))

     def _rsync(self, args):
         """
@@ -234,14 +237,15 @@ def _rsync(self, args):
         start_time = time.time()
         ret_val = None
         try:
-            with Timeout(120):
+            with Timeout(self.rsync_timeout):
                 proc = subprocess.Popen(args, stdout=subprocess.PIPE,
                     stderr=subprocess.STDOUT)
                 results = proc.stdout.read()
                 ret_val = proc.wait()
-        finally:
-            if ret_val is None:
-                proc.kill()
+        except Timeout:
+            self.logger.error("Killing long-running rsync: %s" % str(args))
+            proc.kill()
+            return 1  # failure response code
         total_time = time.time() - start_time
         if results:
             for result in results.split('\n'):
@@ -259,7 +263,7 @@ def _rsync(self, args):
                 args[-2], args[-1], total_time, ret_val))
         if ret_val:
             self.logger.error('Bad rsync return code: %d' % ret_val)
-        return ret_val, results
+        return ret_val

     def rsync(self, node, job, suffixes):
         """
@@ -282,8 +286,8 @@ def rsync(self, node, job, suffixes):
             '--xattrs',
             '--itemize-changes',
             '--ignore-existing',
-            '--timeout=%s' % self.timeout,
-            '--contimeout=%s' % self.timeout,
+            '--timeout=%s' % self.rsync_io_timeout,
+            '--contimeout=%s' % self.rsync_io_timeout,
         ]
         if self.vm_test_mode:
             rsync_module = '%s::object%s' % (node['ip'], node['port'])
@@ -299,8 +303,7 @@ def rsync(self, node, job, suffixes):
                 return False
         args.append(join(rsync_module, node['device'],
                     'objects', job['partition']))
-        ret_val, results = self._rsync(args)
-        return ret_val == 0
+        return self._rsync(args) == 0

     def check_ring(self):
         """
@@ -334,7 +337,7 @@ def tpool_get_suffixes(path):
                 for node in job['nodes']:
                     success = self.rsync(node, job, suffixes)
                     if success:
-                        with Timeout(60):
+                        with Timeout(self.http_timeout):
                             http_connect(node['ip'],
                                 node['port'],
                                 node['device'], job['partition'], 'REPLICATE',
@@ -371,7 +374,7 @@ def update(self, job):
             node = next(nodes)
             attempts_left -= 1
             try:
-                with Timeout(60):
+                with Timeout(self.http_timeout):
                     resp = http_connect(node['ip'], node['port'],
                         node['device'], job['partition'], 'REPLICATE',
                         '', headers={'Content-Length': '0'}).getresponse()
@@ -394,7 +397,7 @@ def update(self, job):
                     self.rsync(node, job, suffixes)
                     recalculate_hashes(job['path'], suffixes,
                         reclaim_age=self.reclaim_age)
-                    with Timeout(60):
+                    with Timeout(self.http_timeout):
                         conn = http_connect(node['ip'], node['port'],
                             node['device'], job['partition'], 'REPLICATE',
                             '/' + '-'.join(suffixes),
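
The three hunks above make the same substitution: every hard-coded 60-second bound around a REPLICATE request becomes self.http_timeout. Stripped of the Swift plumbing, the pattern is an HTTP round-trip wrapped in an eventlet Timeout; a small sketch under the assumption that sockets are monkey-patched (as they are in an eventlet daemon), since otherwise the timeout cannot interrupt the blocking I/O:

import eventlet
eventlet.monkey_patch()  # green sockets, so Timeout can interrupt the request

from http.client import HTTPConnection
from eventlet import Timeout

def replicate_request(ip, port, path, http_timeout=60):
    """One bounded REPLICATE round-trip; None means the peer timed out."""
    try:
        with Timeout(http_timeout):
            conn = HTTPConnection(ip, port)
            conn.request('REPLICATE', path, headers={'Content-Length': '0'})
            return conn.getresponse()
    except Timeout:
        return None

# resp = replicate_request('10.0.0.1', 6000, '/sda1/12345')  # hypothetical peer
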
@@ -448,16 +451,24 @@ def kill_coros(self):
     def heartbeat(self):
         """
         Loop that runs in the background during replication. It periodically
-        logs progress and attempts to detect lockups, killing any running
-        coroutines if the replicator hasn't made progress since last hearbeat.
+        logs progress.
         """
         while True:
+            eventlet.sleep(self.stats_interval)
+            self.stats_line()
+
+    def detect_lockups(self):
+        """
+        In testing, the pool.waitall() call very occasionally failed to return.
+        This is an attempt to make sure the replicator finishes its replication
+        pass in some eventuality.
+        """
+        while True:
+            eventlet.sleep(self.lockup_timeout)
             if self.replication_count == self.last_replication_count:
                 self.logger.error("Lockup detected.. killing live coros.")
                 self.kill_coros()
             self.last_replication_count = self.replication_count
-            eventlet.sleep(300)
-            self.stats_line()

     def replicate(self):
         """Run a replication pass"""
@@ -470,6 +481,7 @@ def replicate(self):
         self.partition_times = []
         jobs = []
         stats = eventlet.spawn(self.heartbeat)
+        lockup_detector = eventlet.spawn(self.detect_lockups)
         try:
             ips = whataremyips()
             self.run_pool = GreenPool(size=self.concurrency)
@@ -508,13 +520,15 @@ def replicate(self):
                     self.run_pool.spawn(self.update_deleted, job)
                 else:
                     self.run_pool.spawn(self.update, job)
-            with Timeout(120):
+            with Timeout(self.lockup_timeout):
                 self.run_pool.waitall()
         except (Exception, Timeout):
-            self.logger.exception("Exception while replicating")
+            self.logger.exception("Exception in top-level replication loop")
             self.kill_coros()
-        self.stats_line()
-        stats.kill()
+        finally:
+            stats.kill()
+            lockup_detector.kill()
+            self.stats_line()

     def run_once(self):
         start = time.time()
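
Taken together, replicate() now runs two background greenthreads for the duration of a pass: heartbeat only logs progress, while detect_lockups sleeps lockup_timeout and kills the worker coroutines if the replication counter has not moved, and the new finally block guarantees both are torn down however the pass ends. A condensed standalone sketch of that supervision shape (simplified names and a plain counter; not the replicator itself):

import eventlet
from eventlet import GreenPool, Timeout

class Supervisor(object):
    def __init__(self, lockup_timeout=900, stats_interval=300, concurrency=1):
        self.lockup_timeout = lockup_timeout
        self.stats_interval = stats_interval
        self.pool = GreenPool(size=concurrency)
        self.done = self.last_done = 0

    def heartbeat(self):
        while True:  # progress logging only
            eventlet.sleep(self.stats_interval)
            print('%d jobs done' % self.done)

    def detect_lockups(self):
        while True:  # last-resort watchdog
            eventlet.sleep(self.lockup_timeout)
            if self.done == self.last_done:
                for coro in list(self.pool.coroutines_running):
                    coro.kill()  # unstick a wedged pass
            self.last_done = self.done

    def _tracked(self, worker, job):
        worker(job)
        self.done += 1  # progress the watchdog looks at

    def run_pass(self, jobs, worker):
        stats = eventlet.spawn(self.heartbeat)
        watchdog = eventlet.spawn(self.detect_lockups)
        try:
            for job in jobs:
                self.pool.spawn(self._tracked, worker, job)
            with Timeout(self.lockup_timeout):
                self.pool.waitall()
        except (Exception, Timeout):
            pass  # even a wedged pool cannot hang the pass forever
        finally:
            stats.kill()      # always stop the background greenthreads
            watchdog.kill()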

swift/obj/server.py

+3 -4

@@ -538,13 +538,12 @@ def REPLICATE(self, request):
             unquote(request.path), 2, 3, True)
         if self.mount_check and not check_mount(self.devices, device):
             return Response(status='507 %s is not mounted' % device)
-        if suffix:
-            recalculate_hashes(os.path.join(self.devices, device,
-                DATADIR, partition), suffix.split('-'))
-            return Response()
         path = os.path.join(self.devices, device, DATADIR, partition)
         if not os.path.exists(path):
             mkdirs(path)
+        if suffix:
+            recalculate_hashes(path, suffix.split('-'))
+            return Response()
         _, hashes = get_hashes(path, do_listdir=False)
         return Response(body=pickle.dumps(hashes))
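
The server-side change is an ordering fix: REPLICATE now creates the partition directory before handling a suffix-scoped request, where previously the suffix branch returned early and skipped the mkdirs for a brand-new partition. A generic sketch of the corrected ordering (hypothetical helper names; recalc stands in for recalculate_hashes):

import os

def handle_replicate(devices, device, partition, suffix=None,
                     recalc=lambda path, suffixes: None):
    """Ensure the partition dir exists, then update suffix hashes if asked."""
    path = os.path.join(devices, device, 'objects', partition)
    if not os.path.exists(path):
        os.makedirs(path)                # now runs even for suffix requests
    if suffix:
        recalc(path, suffix.split('-'))  # reuses the path built above
        return 'OK'
    # ...otherwise fall through and return the pickled suffix hashes, as above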
