
Commit d78f48e

Update overlay survey script with lessons learned during testnet run (#4358)
# Description

This change makes a few tweaks to the overlay survey script to fix some small things I noticed after running it on testnet:

* Changes the script's end condition to depend only on responses, and not requests (a simplified sketch of the new logic follows the checklist below). Without this it was possible for the survey script to run for the full duration of the collecting phase (2 hours) if a node with more than 25 peers stopped responding after the surveyor received the first set of peers.
* Downgrades the severity of "node already in backlog" messages from `error` to `debug`. This is an expected condition that I simply forgot to special-case before.
* Modifies the simulator to occasionally return "node already in backlog" messages to test the script against that case.
* Adds a `--fast` option to the `simulate` mode that skips any `sleep` calls. This makes the script much nicer to test.
* Fixes naming of graphml fields to match JSON result fields.
  * I did most of this in the V2 script update, but missed a couple spots.
  * Most of this change is in the simulator to support the new field names.

# Checklist
- [x] Reviewed the [contributing](https://github.com/stellar/stellar-core/blob/master/CONTRIBUTING.md#submitting-changes) document
- [x] Rebased on top of master (no merge commits)
- [ ] Ran `clang-format` v8.0.0 (via `make format` or the Visual Studio extension)
- [ ] Compiles
- [ ] Ran all tests
- [ ] If change impacts performance, include supporting evidence per the [performance document](https://github.com/stellar/stellar-core/blob/master/performance-eval/performance-eval.md)
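As a rough illustration of the revised end condition (this is not the script's code; the real logic lives in the `run_survey` loop in the diff below, and `send_requests`/`fetch_topology` are hypothetical stand-ins for the script's HTTP helpers):

```python
# Minimal sketch: the inactivity counter now resets only when responses
# arrive, never merely because requests were sent.
MAX_INACTIVE_ROUNDS = 8  # 8 rounds x ~15 s per round ~= 2 minutes of silence


def survey_loop(send_requests, fetch_topology):
    inactive_rounds = 0
    heard_from = set()
    incomplete_responses = set()
    while inactive_rounds < MAX_INACTIVE_ROUNDS:
        inactive_rounds += 1          # sending requests no longer resets this
        send_requests()
        for key, node_data in fetch_topology().items():
            if node_data is None:
                continue
            if key not in heard_from:
                inactive_rounds = 0   # brand-new response
                heard_from.add(key)
            elif key in incomplete_responses and node_data:
                inactive_rounds = 0   # more peer data for a known responder
                incomplete_responses.discard(key)
        # (the real loop re-adds nodes that still have missing peers to
        # incomplete_responses and queues follow-up requests here)
```

Previously a non-empty request batch alone reset the counter, so a single unresponsive node with a large peer list could keep the script alive for the whole two-hour collecting phase; now only new or additional responses do.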
2 parents 07e8569 + e268256 commit d78f48e

File tree

* scripts/OverlaySurvey.py
* scripts/overlay_survey/simulation.py
* scripts/overlay_survey/util.py

3 files changed: +107 -64 lines

scripts/OverlaySurvey.py

Lines changed: 78 additions & 57 deletions
```diff
@@ -84,15 +84,14 @@
 # internal limit.
 MAX_COLLECT_DURATION = 30
 
-# Maximum number of consecutive rounds in which the surveyor neither sent
-# requests to nor received responses from any nodes. A round contains a batch of
-# requests sent to select nodes, followed by a wait period of 15 seconds,
-# followed by checking for responses and building up the next batch of requests
-# to send. Therefore, a setting of `8` is roughly 2 minutes of inactivity
-# before the script considers the survey complete. This is necessary because
-# it's very likely that not all surveyed nodes will respond to the survey.
-# Therefore, we need some cutoff after we which we assume those nodes will never
-# respond.
+# Maximum number of consecutive rounds in which the surveyor does not receive
+# responses from any nodes. A round contains a batch of requests sent to select
+# nodes, followed by a wait period of 15 seconds, followed by checking for
+# responses and building up the next batch of requests to send. Therefore, a
+# setting of `8` is roughly 2 minutes of inactivity before the script considers
+# the survey complete. This is necessary because it's very likely that not all
+# surveyed nodes will respond to the survey. Therefore, we need some cutoff
+# after which we assume those nodes will never respond.
 MAX_INACTIVE_ROUNDS = 8
 
 def get_request(url, params=None):
@@ -129,15 +128,14 @@ def get_next_peers(topology):
 
 def update_node(graph, node_info, node_key, results, field_names):
     """
-    For each `(info_field, node_field)` pair in `field_names`, if `info_field`
-    is in `node_info`, modify the node in `graph` with key `node_key` to store
-    the value of `info_field` in `node_field`.
+    For each `field_name` in `field_names`, if `field_name` is in `node_info`,
+    modify `graph` and `results` to contain the field.
     """
-    for (info_field, node_field) in field_names:
-        if info_field in node_info:
-            val = node_info[info_field]
-            results[node_field] = val
-            graph.add_node(node_key, **{node_field: val})
+    for field_name in field_names:
+        if field_name in node_info:
+            val = node_info[field_name]
+            results[field_name] = val
+            graph.add_node(node_key, **{field_name: val})
 
 def update_results(graph, parent_info, parent_key, results, is_inbound):
     direction_tag = "inboundPeers" if is_inbound else "outboundPeers"
@@ -158,16 +156,16 @@ def update_results(graph, parent_info, parent_key, results, is_inbound):
         graph.add_edge(parent_key, other_key, **edge_properties)
 
     # Add survey results to parent node (if available)
-    field_names = [("numTotalInboundPeers", "totalInbound"),
-                   ("numTotalOutboundPeers", "totalOutbound"),
-                   ("maxInboundPeerCount", "maxInboundPeerCount"),
-                   ("maxOutboundPeerCount", "maxOutboundPeerCount"),
-                   ("addedAuthenticatedPeers", "addedAuthenticatedPeers"),
-                   ("droppedAuthenticatedPeers", "droppedAuthenticatedPeers"),
-                   ("p75SCPFirstToSelfLatencyMs", "p75SCPFirstToSelfLatencyMs"),
-                   ("p75SCPSelfToOtherLatencyMs", "p75SCPSelfToOtherLatencyMs"),
-                   ("lostSyncCount", "lostSyncCount"),
-                   ("isValidator", "isValidator")]
+    field_names = ["numTotalInboundPeers",
+                   "numTotalOutboundPeers",
+                   "maxInboundPeerCount",
+                   "maxOutboundPeerCount",
+                   "addedAuthenticatedPeers",
+                   "droppedAuthenticatedPeers",
+                   "p75SCPFirstToSelfLatencyMs",
+                   "p75SCPSelfToOtherLatencyMs",
+                   "lostSyncCount",
+                   "isValidator"]
     update_node(graph, parent_info, parent_key, results, field_names)
 
 
@@ -187,8 +185,18 @@ def send_survey_requests(peer_list, url_base):
                 util.SURVEY_TOPOLOGY_TIME_SLICED_SUCCESS_START):
             logger.debug("Send request to %s", nodeid)
         else:
-            logger.error("Failed to send survey request to %s: %s",
-                         nodeid, response.text)
+            try:
+                exception = response.json()["exception"]
+                if exception == \
+                   util.SURVEY_TOPOLOGY_TIME_SLICED_ALREADY_IN_BACKLOG_OR_SELF:
+                    logger.debug("Node %s is already in backlog or is self",
+                                 nodeid)
+                else:
+                    logger.error("Failed to send survey request to %s: %s",
+                                 nodeid, exception)
+            except (requests.exceptions.JSONDecodeError, KeyError):
+                logger.error("Failed to send survey request to %s: %s",
+                             nodeid, response.text)
 
     logger.info("Done sending survey requests")
 
@@ -309,8 +317,8 @@ def augment(args):
 def run_survey(args):
     graph = nx.DiGraph()
     merged_results = defaultdict(lambda: {
-        "totalInbound": 0,
-        "totalOutbound": 0,
+        "numTotalInboundPeers": 0,
+        "numTotalOutboundPeers": 0,
         "maxInboundPeerCount": 0,
         "maxOutboundPeerCount": 0,
         "inboundPeers": {},
@@ -324,6 +332,7 @@ def run_survey(args):
         logger.critical("%s", e)
         sys.exit(1)
 
+    skip_sleep = args.simulate and args.fast
     url = args.node
 
     peers = url + "/peers"
@@ -339,10 +348,11 @@ def run_survey(args):
         logger.critical("Failed to start survey: %s", response.text)
         sys.exit(1)
 
-    # Sleep for duration of collecting phase
-    logger.info("Sleeping for collecting phase (%i minutes)",
-                args.collect_duration)
-    time.sleep(args.collect_duration * 60)
+    if not skip_sleep:
+        # Sleep for duration of collecting phase
+        logger.info("Sleeping for collecting phase (%i minutes)",
+                    args.collect_duration)
+        time.sleep(args.collect_duration * 60)
 
     # Stop survey recording
     logger.info("Stopping survey collecting")
@@ -351,12 +361,13 @@ def run_survey(args):
         logger.critical("Failed to stop survey: %s", response.text)
         sys.exit(1)
 
-    # Allow time for stop message to propagate
-    sleep_time = 60
-    logger.info(
-        "Waiting %i seconds for 'stop collecting' message to propagate",
-        sleep_time)
-    time.sleep(sleep_time)
+    if not skip_sleep:
+        # Allow time for stop message to propagate
+        sleep_time = 60
+        logger.info(
+            "Waiting %i seconds for 'stop collecting' message to propagate",
+            sleep_time)
+        time.sleep(sleep_time)
 
     peer_list = set()
     if args.nodeList:
@@ -387,16 +398,14 @@ def run_survey(args):
 
     sent_requests = set()
     heard_from = set()
+    incomplete_responses = set()
 
     # Number of consecutive rounds in which surveyor neither sent requests nor
    # received responses
     inactive_rounds = 0
 
     while True:
-        if peer_list:
-            inactive_rounds = 0
-        else:
-            inactive_rounds += 1
+        inactive_rounds += 1
 
         send_survey_requests(peer_list, url)
 
@@ -405,25 +414,33 @@ def run_survey(args):
 
         peer_list = set()
 
-        # allow time for results. Stellar-core sends out a batch of requests
-        # every 15 seconds, so there's not much benefit in checking more
-        # frequently than that
-        sleep_time = 15
-        logger.info("Waiting %i seconds for survey results", sleep_time)
-        time.sleep(sleep_time)
+        if not skip_sleep:
+            # allow time for results. Stellar-core sends out a batch of requests
+            # every 15 seconds, so there's not much benefit in checking more
+            # frequently than that
+            sleep_time = 15
+            logger.info("Waiting %i seconds for survey results", sleep_time)
+            time.sleep(sleep_time)
 
         logger.info("Fetching survey result")
         data = get_request(url=survey_result).json()
         logger.info("Done fetching result")
 
         if "topology" in data:
             for key in data["topology"]:
-                if data["topology"][key] is not None:
+                node_data = data["topology"][key]
+                if node_data is not None:
                     if key not in heard_from:
                         # Received a new response!
                         logger.debug("Received response from %s", key)
                         inactive_rounds = 0
                         heard_from.add(key)
+                    elif key in incomplete_responses and len(node_data) > 0:
+                        # Received additional data for a node that previously
+                        # responded
+                        logger.debug("Received additional data for %s", key)
+                        inactive_rounds = 0
+                        incomplete_responses.remove(key)
 
         waiting_to_hear = set()
         for node in sent_requests:
@@ -455,11 +472,11 @@ def run_survey(args):
             node = merged_results[key]
             have_inbound = len(node["inboundPeers"])
            have_outbound = len(node["outboundPeers"])
-            if (node["totalInbound"] > have_inbound or
-                    node["totalOutbound"] > have_outbound):
-                peer_list.add(util.PendingRequest(key,
-                                                  have_inbound,
-                                                  have_outbound))
+            if (node["numTotalInboundPeers"] > have_inbound or
+                    node["numTotalOutboundPeers"] > have_outbound):
+                incomplete_responses.add(key)
+                req = util.PendingRequest(key, have_inbound, have_outbound)
+                peer_list.add(req)
         logger.info("New nodes: %s Gathering additional peer data: %s",
                     new_peers, len(peer_list)-new_peers)
 
@@ -554,6 +571,10 @@ def main():
                                  "--simRoot",
                                  required=True,
                                  help="node to start simulation from")
+    parser_simulate.add_argument("-f",
+                                 "--fast",
+                                 action="store_true",
+                                 help="Skip sleep calls during simulation.")
     parser_simulate.set_defaults(simulate=True)
 
     parser_analyze = subparsers.add_parser('analyze',
```
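To make the graphml/JSON field-name unification concrete, here is a small standalone sketch (assuming only `networkx`, which the script already uses; the node key and value are made up): after `update_node`, the JSON results dict and the graph node attributes share the same key, so a GraphML export carries the same field names as the JSON survey results.

```python
# Standalone illustration of the unified field naming; mirrors the shape of
# update_node from the diff above, but is not the script itself.
import networkx as nx


def update_node(graph, node_info, node_key, results, field_names):
    for field_name in field_names:
        if field_name in node_info:
            val = node_info[field_name]
            results[field_name] = val
            graph.add_node(node_key, **{field_name: val})


graph = nx.DiGraph()
results = {}
update_node(graph, {"numTotalInboundPeers": 3}, "example-node", results,
            ["numTotalInboundPeers"])
assert results["numTotalInboundPeers"] == 3
assert graph.nodes["example-node"]["numTotalInboundPeers"] == 3
```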

scripts/overlay_survey/simulation.py

Lines changed: 19 additions & 6 deletions
```diff
@@ -63,7 +63,6 @@ def _add_v2_survey_data(node_json):
     for peer in node_json["outboundPeers"]:
         peer["averageLatencyMs"] = random.randint(0, 2**32-1)
 
-
 class SurveySimulation:
     """
     Simulates the HTTP endpoints of stellar-core's overlay survey. Raises
@@ -139,11 +138,25 @@ def _surveytopologytimesliced(self, params):
         assert params.keys() == {"node",
                                  "inboundpeerindex",
                                  "outboundpeerindex"}
-        if params["node"] != self._root_node:
-            req = util.PendingRequest(params["node"],
-                                      params["inboundpeerindex"],
-                                      params["outboundpeerindex"])
-            self._pending_requests.append(req)
+
+        fail_response = SimulatedResponse(
+            {"exception" :
+             util.SURVEY_TOPOLOGY_TIME_SLICED_ALREADY_IN_BACKLOG_OR_SELF})
+        node = params["node"]
+        inbound_peer_idx = params["inboundpeerindex"]
+        outbound_peer_idx = params["outboundpeerindex"]
+        if node == self._root_node:
+            # Nodes cannot survey themselves (yet)
+            return fail_response
+
+        if ((inbound_peer_idx > 0 or outbound_peer_idx > 0) and
+                random.random() < 0.2):
+            # Randomly indicate that node is already in backlog if it is being
+            # resurveyed. Script should handle this by trying again later.
+            return fail_response
+
+        req = util.PendingRequest(node, inbound_peer_idx, outbound_peer_idx)
+        self._pending_requests.append(req)
         return SimulatedResponse(
             text=util.SURVEY_TOPOLOGY_TIME_SLICED_SUCCESS_TEXT)
 
```
scripts/overlay_survey/util.py

Lines changed: 10 additions & 1 deletion
```diff
@@ -25,4 +25,13 @@
 # "Adding node."
 SURVEY_TOPOLOGY_TIME_SLICED_SUCCESS_START = "Adding node."
 SURVEY_TOPOLOGY_TIME_SLICED_SUCCESS_TEXT = \
-    SURVEY_TOPOLOGY_TIME_SLICED_SUCCESS_START + "Survey already running!"
+    SURVEY_TOPOLOGY_TIME_SLICED_SUCCESS_START + "Survey already running!"
+
+# The error response from the surveytopologytimesliced endpoint when the survey
+# backlog already contains the node requested to be surveyed, or the requested
+# node is the surveyor. stellar-core returns this error JSON object where the
+# error text is contained in the "exception" field.
+SURVEY_TOPOLOGY_TIME_SLICED_ALREADY_IN_BACKLOG_OR_SELF = (
+    "addPeerToBacklog failed: Peer is already in the backlog, or peer "
+    "is self."
+)
```
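A simplified sketch of how a client tells these two responses apart (it mirrors `send_survey_requests` in `scripts/OverlaySurvey.py` above, but the `startswith` check and the import path are assumptions on my part; `response` is any `requests.Response`-like object):

```python
import requests

from overlay_survey import util  # assumed import path


def classify_response(response):
    """Classify a surveytopologytimesliced response (illustrative only)."""
    if response.text.startswith(
            util.SURVEY_TOPOLOGY_TIME_SLICED_SUCCESS_START):
        return "accepted"
    try:
        exception = response.json()["exception"]
    except (requests.exceptions.JSONDecodeError, KeyError):
        return "error"
    if exception == \
            util.SURVEY_TOPOLOGY_TIME_SLICED_ALREADY_IN_BACKLOG_OR_SELF:
        # Expected condition; the script retries the node in a later round.
        return "already-in-backlog-or-self"
    return "error"
```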
