Skip to content

Commit 0862a6a

Browse files
authored
Merge pull request kocolosk#1 from natarajaya/GPII-3624
[GPII-3624]: Improve peer discovery mechanics
2 parents 6f0d06d + b19c942 commit 0862a6a

File tree

1 file changed

+28
-5
lines changed

1 file changed

+28
-5
lines changed

mem3_helper.py

+28 -5
Original file line number | Diff line number | Diff line change
@@ -13,22 +13,45 @@
1313
import backoff
1414
import os
1515

16+
class PeerDiscoveryException(Exception):
17+
pass
18+
1619
def construct_service_record():
1720
# Drop our Pod's unique identity and replace with '_couchdb._tcp'
1821
return os.getenv('SRV_RECORD') or '.'.join(['_couchdb', '_tcp'] + socket.getfqdn().split('.')[1:])
1922

2023
@backoff.on_exception(
2124
backoff.expo,
2225
dns.resolver.NXDOMAIN,
23-
max_tries=10
26+
max_tries=15
27+
)
28+
@backoff.on_exception(
29+
backoff.expo,
30+
PeerDiscoveryException,
31+
max_tries=15
2432
)
2533
def discover_peers(service_record):
26-
print ('Resolving SRV record', service_record)
27-
answers = dns.resolver.query(service_record, 'SRV')
34+
expected_peers_count = os.getenv('COUCHDB_CLUSTER_SIZE')
35+
if expected_peers_count:
36+
expected_peers_count = int(expected_peers_count)
37+
print('Expecting', expected_peers_count, 'peers...')
38+
else:
39+
print('Looks like COUCHDB_CLUSTER_SIZE is not set, will not wait for DNS to fully propagate...')
40+
print('Resolving SRV record:', service_record)
2841
# Erlang requires that we drop the trailing period from the absolute DNS
2942
# name to form the hostname used for the Erlang node. This feels hacky
3043
# but not sure of a more official answer
31-
return [rdata.target.to_text()[:-1] for rdata in answers]
44+
answers = dns.resolver.query(service_record, 'SRV')
45+
peers = [rdata.target.to_text()[:-1] for rdata in answers]
46+
peers_count = len(peers)
47+
if expected_peers_count:
48+
print('Discovered', peers_count, 'of', expected_peers_count, 'peers:', peers)
49+
if peers_count != expected_peers_count:
50+
print('Waiting for cluster DNS to fully propagate...')
51+
raise PeerDiscoveryException
52+
else:
53+
print('Discovered', peers_count, 'peers:', peers)
54+
return peers
3255

3356
@backoff.on_exception(
3457
backoff.expo,
@@ -45,7 +68,7 @@ def connect_the_dots(names):
4568
else:
4669
resp = requests.put(uri, data=json.dumps(doc))
4770
while resp.status_code == 404:
48-
print('Waiting for _nodes DB to be created ...')
71+
print('Waiting for _nodes DB to be created...')
4972
time.sleep(5)
5073
resp = requests.put(uri, data=json.dumps(doc))
5174
print('Adding cluster member', name, resp.status_code)

0 commit comments

Comments
 (0)