13
13
import backoff
14
14
import os
15
15
16
+ class PeerDiscoveryException (Exception ):
17
+ pass
18
+
16
19
def construct_service_record ():
17
20
# Drop our Pod's unique identity and replace with '_couchdb._tcp'
18
21
return os .getenv ('SRV_RECORD' ) or '.' .join (['_couchdb' , '_tcp' ] + socket .getfqdn ().split ('.' )[1 :])
19
22
20
23
@backoff .on_exception (
21
24
backoff .expo ,
22
25
dns .resolver .NXDOMAIN ,
23
- max_tries = 10
26
+ max_tries = 15
27
+ )
28
+ @backoff .on_exception (
29
+ backoff .expo ,
30
+ PeerDiscoveryException ,
31
+ max_tries = 15
24
32
)
25
33
def discover_peers (service_record ):
26
- print ('Resolving SRV record' , service_record )
27
- answers = dns .resolver .query (service_record , 'SRV' )
34
+ expected_peers_count = os .getenv ('COUCHDB_CLUSTER_SIZE' )
35
+ if expected_peers_count :
36
+ expected_peers_count = int (expected_peers_count )
37
+ print ('Expecting' , expected_peers_count , 'peers...' )
38
+ else :
39
+ print ('Looks like COUCHDB_CLUSTER_SIZE is not set, will not wait for DNS to fully propagate...' )
40
+ print ('Resolving SRV record:' , service_record )
28
41
# Erlang requires that we drop the trailing period from the absolute DNS
29
42
# name to form the hostname used for the Erlang node. This feels hacky
30
43
# but not sure of a more official answer
31
- return [rdata .target .to_text ()[:- 1 ] for rdata in answers ]
44
+ answers = dns .resolver .query (service_record , 'SRV' )
45
+ peers = [rdata .target .to_text ()[:- 1 ] for rdata in answers ]
46
+ peers_count = len (peers )
47
+ if expected_peers_count :
48
+ print ('Discovered' , peers_count , 'of' , expected_peers_count , 'peers:' , peers )
49
+ if peers_count != expected_peers_count :
50
+ print ('Waiting for cluster DNS to fully propagate...' )
51
+ raise PeerDiscoveryException
52
+ else :
53
+ print ('Discovered' , peers_count , 'peers:' , peers )
54
+ return peers
32
55
33
56
@backoff .on_exception (
34
57
backoff .expo ,
@@ -45,7 +68,7 @@ def connect_the_dots(names):
45
68
else :
46
69
resp = requests .put (uri , data = json .dumps (doc ))
47
70
while resp .status_code == 404 :
48
- print ('Waiting for _nodes DB to be created ...' )
71
+ print ('Waiting for _nodes DB to be created...' )
49
72
time .sleep (5 )
50
73
resp = requests .put (uri , data = json .dumps (doc ))
51
74
print ('Adding cluster member' , name , resp .status_code )
0 commit comments