
Commit d78f48e

Update overlay survey script with lessons learned during testnet run (#4358)
# Description

This change makes a few tweaks to the overlay survey script to fix some small things I noticed after running it on testnet:

* Changes the script's end condition to depend only on responses, and not requests (a simplified sketch of the new logic follows the checklist below). Without this it was possible for the survey script to run for the full duration of the collecting phase (2 hours) if a node with more than 25 peers stopped responding after the surveyor received the first set of peers.
* Downgrades the severity of "node already in backlog" messages from `error` to `debug`. This is an expected condition that I simply forgot to special-case before.
* Modifies the simulator to occasionally return "node already in backlog" messages to test the script against that case.
* Adds a `--fast` option to the `simulate` mode that skips any `sleep` calls. This makes the script much nicer to test.
* Fixes naming of graphml fields to match JSON result fields.
  * I did most of this in the V2 script update, but missed a couple spots.
  * Most of this change is in the simulator to support the new field names.

# Checklist
- [x] Reviewed the [contributing](https://github.com/stellar/stellar-core/blob/master/CONTRIBUTING.md#submitting-changes) document
- [x] Rebased on top of master (no merge commits)
- [ ] Ran `clang-format` v8.0.0 (via `make format` or the Visual Studio extension)
- [ ] Compiles
- [ ] Ran all tests
- [ ] If change impacts performance, include supporting evidence per the [performance document](https://github.com/stellar/stellar-core/blob/master/performance-eval/performance-eval.md)
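As a rough illustration of the revised end condition (this is not the script's code; the real logic lives in the `run_survey` loop in the diff below, and `send_requests`/`fetch_topology` are hypothetical stand-ins for the script's HTTP helpers):

```python
# Minimal sketch: the inactivity counter now resets only when responses
# arrive, never merely because requests were sent.
MAX_INACTIVE_ROUNDS = 8  # 8 rounds x ~15 s per round ~= 2 minutes of silence


def survey_loop(send_requests, fetch_topology):
    inactive_rounds = 0
    heard_from = set()
    incomplete_responses = set()
    while inactive_rounds < MAX_INACTIVE_ROUNDS:
        inactive_rounds += 1          # sending requests no longer resets this
        send_requests()
        for key, node_data in fetch_topology().items():
            if node_data is None:
                continue
            if key not in heard_from:
                inactive_rounds = 0   # brand-new response
                heard_from.add(key)
            elif key in incomplete_responses and node_data:
                inactive_rounds = 0   # more peer data for a known responder
                incomplete_responses.discard(key)
        # (the real loop re-adds nodes that still have missing peers to
        # incomplete_responses and queues follow-up requests here)
```

Previously a non-empty request batch alone reset the counter, so a single unresponsive node with a large peer list could keep the script alive for the whole two-hour collecting phase; now only new or additional responses do.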
2 parents 07e8569 + e268256 commit d78f48e

File tree

* scripts/OverlaySurvey.py
* scripts/overlay_survey/simulation.py
* scripts/overlay_survey/util.py

3 files changed: +107 -64 lines

scripts/OverlaySurvey.py

Lines changed: 78 additions & 57 deletions
```diff
@@ -84,15 +84,14 @@
 # internal limit.
 MAX_COLLECT_DURATION = 30
 
-# Maximum number of consecutive rounds in which the surveyor neither sent
-# requests to nor received responses from any nodes. A round contains a batch of
-# requests sent to select nodes, followed by a wait period of 15 seconds,
-# followed by checking for responses and building up the next batch of requests
-# to send. Therefore, a setting of `8` is roughly 2 minutes of inactivity
-# before the script considers the survey complete. This is necessary because
-# it's very likely that not all surveyed nodes will respond to the survey.
-# Therefore, we need some cutoff after we which we assume those nodes will never
-# respond.
+# Maximum number of consecutive rounds in which the surveyor does not receive
+# responses from any nodes. A round contains a batch of requests sent to select
+# nodes, followed by a wait period of 15 seconds, followed by checking for
+# responses and building up the next batch of requests to send. Therefore, a
+# setting of `8` is roughly 2 minutes of inactivity before the script considers
+# the survey complete. This is necessary because it's very likely that not all
+# surveyed nodes will respond to the survey. Therefore, we need some cutoff
+# after which we assume those nodes will never respond.
 MAX_INACTIVE_ROUNDS = 8
 
 def get_request(url, params=None):
@@ -129,15 +128,14 @@ def get_next_peers(topology):
 
 def update_node(graph, node_info, node_key, results, field_names):
     """
-    For each `(info_field, node_field)` pair in `field_names`, if `info_field`
-    is in `node_info`, modify the node in `graph` with key `node_key` to store
-    the value of `info_field` in `node_field`.
+    For each `field_name` in `field_names`, if `field_name` is in `node_info`,
+    modify `graph` and `results` to contain the field.
     """
-    for (info_field, node_field) in field_names:
-        if info_field in node_info:
-            val = node_info[info_field]
-            results[node_field] = val
-            graph.add_node(node_key, **{node_field: val})
+    for field_name in field_names:
+        if field_name in node_info:
+            val = node_info[field_name]
+            results[field_name] = val
+            graph.add_node(node_key, **{field_name: val})
 
 def update_results(graph, parent_info, parent_key, results, is_inbound):
     direction_tag = "inboundPeers" if is_inbound else "outboundPeers"
@@ -158,16 +156,16 @@ def update_results(graph, parent_info, parent_key, results, is_inbound):
         graph.add_edge(parent_key, other_key, **edge_properties)
 
     # Add survey results to parent node (if available)
-    field_names = [("numTotalInboundPeers", "totalInbound"),
-                   ("numTotalOutboundPeers", "totalOutbound"),
-                   ("maxInboundPeerCount", "maxInboundPeerCount"),
-                   ("maxOutboundPeerCount", "maxOutboundPeerCount"),
-                   ("addedAuthenticatedPeers", "addedAuthenticatedPeers"),
-                   ("droppedAuthenticatedPeers", "droppedAuthenticatedPeers"),
-                   ("p75SCPFirstToSelfLatencyMs", "p75SCPFirstToSelfLatencyMs"),
-                   ("p75SCPSelfToOtherLatencyMs", "p75SCPSelfToOtherLatencyMs"),
-                   ("lostSyncCount", "lostSyncCount"),
-                   ("isValidator", "isValidator")]
+    field_names = ["numTotalInboundPeers",
+                   "numTotalOutboundPeers",
+                   "maxInboundPeerCount",
+                   "maxOutboundPeerCount",
+                   "addedAuthenticatedPeers",
+                   "droppedAuthenticatedPeers",
+                   "p75SCPFirstToSelfLatencyMs",
+                   "p75SCPSelfToOtherLatencyMs",
+                   "lostSyncCount",
+                   "isValidator"]
     update_node(graph, parent_info, parent_key, results, field_names)
 
 
@@ -187,8 +185,18 @@ def send_survey_requests(peer_list, url_base):
                 util.SURVEY_TOPOLOGY_TIME_SLICED_SUCCESS_START):
             logger.debug("Send request to %s", nodeid)
         else:
-            logger.error("Failed to send survey request to %s: %s",
-                         nodeid, response.text)
+            try:
+                exception = response.json()["exception"]
+                if exception == \
+                   util.SURVEY_TOPOLOGY_TIME_SLICED_ALREADY_IN_BACKLOG_OR_SELF:
+                    logger.debug("Node %s is already in backlog or is self",
+                                 nodeid)
+                else:
+                    logger.error("Failed to send survey request to %s: %s",
+                                 nodeid, exception)
+            except (requests.exceptions.JSONDecodeError, KeyError):
+                logger.error("Failed to send survey request to %s: %s",
+                             nodeid, response.text)
 
     logger.info("Done sending survey requests")
 
@@ -309,8 +317,8 @@ def augment(args):
 def run_survey(args):
     graph = nx.DiGraph()
     merged_results = defaultdict(lambda: {
-        "totalInbound": 0,
-        "totalOutbound": 0,
+        "numTotalInboundPeers": 0,
+        "numTotalOutboundPeers": 0,
         "maxInboundPeerCount": 0,
         "maxOutboundPeerCount": 0,
         "inboundPeers": {},
@@ -324,6 +332,7 @@ def run_survey(args):
         logger.critical("%s", e)
         sys.exit(1)
 
+    skip_sleep = args.simulate and args.fast
     url = args.node
 
     peers = url + "/peers"
@@ -339,10 +348,11 @@ def run_survey(args):
         logger.critical("Failed to start survey: %s", response.text)
         sys.exit(1)
 
-    # Sleep for duration of collecting phase
-    logger.info("Sleeping for collecting phase (%i minutes)",
-                args.collect_duration)
-    time.sleep(args.collect_duration * 60)
+    if not skip_sleep:
+        # Sleep for duration of collecting phase
+        logger.info("Sleeping for collecting phase (%i minutes)",
+                    args.collect_duration)
+        time.sleep(args.collect_duration * 60)
 
     # Stop survey recording
     logger.info("Stopping survey collecting")
@@ -351,12 +361,13 @@ def run_survey(args):
         logger.critical("Failed to stop survey: %s", response.text)
         sys.exit(1)
 
-    # Allow time for stop message to propagate
-    sleep_time = 60
-    logger.info(
-        "Waiting %i seconds for 'stop collecting' message to propagate",
-        sleep_time)
-    time.sleep(sleep_time)
+    if not skip_sleep:
+        # Allow time for stop message to propagate
+        sleep_time = 60
+        logger.info(
+            "Waiting %i seconds for 'stop collecting' message to propagate",
+            sleep_time)
+        time.sleep(sleep_time)
 
     peer_list = set()
     if args.nodeList:
@@ -387,16 +398,14 @@ def run_survey(args):
 
     sent_requests = set()
     heard_from = set()
+    incomplete_responses = set()
 
     # Number of consecutive rounds in which surveyor neither sent requests nor
    # received responses
     inactive_rounds = 0
 
     while True:
-        if peer_list:
-            inactive_rounds = 0
-        else:
-            inactive_rounds += 1
+        inactive_rounds += 1
 
         send_survey_requests(peer_list, url)
 
@@ -405,25 +414,33 @@ def run_survey(args):
 
         peer_list = set()
 
-        # allow time for results. Stellar-core sends out a batch of requests
-        # every 15 seconds, so there's not much benefit in checking more
-        # frequently than that
-        sleep_time = 15
-        logger.info("Waiting %i seconds for survey results", sleep_time)
-        time.sleep(sleep_time)
+        if not skip_sleep:
+            # allow time for results. Stellar-core sends out a batch of requests
+            # every 15 seconds, so there's not much benefit in checking more
+            # frequently than that
+            sleep_time = 15
+            logger.info("Waiting %i seconds for survey results", sleep_time)
+            time.sleep(sleep_time)
 
         logger.info("Fetching survey result")
         data = get_request(url=survey_result).json()
         logger.info("Done fetching result")
 
         if "topology" in data:
             for key in data["topology"]:
-                if data["topology"][key] is not None:
+                node_data = data["topology"][key]
+                if node_data is not None:
                     if key not in heard_from:
                         # Received a new response!
                         logger.debug("Received response from %s", key)
                         inactive_rounds = 0
                         heard_from.add(key)
+                    elif key in incomplete_responses and len(node_data) > 0:
+                        # Received additional data for a node that previously
+                        # responded
+                        logger.debug("Received additional data for %s", key)
+                        inactive_rounds = 0
+                        incomplete_responses.remove(key)
 
         waiting_to_hear = set()
         for node in sent_requests:
@@ -455,11 +472,11 @@ def run_survey(args):
             node = merged_results[key]
             have_inbound = len(node["inboundPeers"])
            have_outbound = len(node["outboundPeers"])
-            if (node["totalInbound"] > have_inbound or
-                    node["totalOutbound"] > have_outbound):
-                peer_list.add(util.PendingRequest(key,
-                                                  have_inbound,
-                                                  have_outbound))
+            if (node["numTotalInboundPeers"] > have_inbound or
+                    node["numTotalOutboundPeers"] > have_outbound):
+                incomplete_responses.add(key)
+                req = util.PendingRequest(key, have_inbound, have_outbound)
+                peer_list.add(req)
         logger.info("New nodes: %s Gathering additional peer data: %s",
                     new_peers, len(peer_list)-new_peers)
 
@@ -554,6 +571,10 @@ def main():
                                  "--simRoot",
                                  required=True,
                                  help="node to start simulation from")
+    parser_simulate.add_argument("-f",
+                                 "--fast",
+                                 action="store_true",
+                                 help="Skip sleep calls during simulation.")
     parser_simulate.set_defaults(simulate=True)
 
     parser_analyze = subparsers.add_parser('analyze',
```
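To make the graphml/JSON field-name unification concrete, here is a small standalone sketch (assuming only `networkx`, which the script already uses; the node key and value are made up): after `update_node`, the JSON results dict and the graph node attributes share the same key, so a GraphML export carries the same field names as the JSON survey results.

```python
# Standalone illustration of the unified field naming; mirrors the shape of
# update_node from the diff above, but is not the script itself.
import networkx as nx


def update_node(graph, node_info, node_key, results, field_names):
    for field_name in field_names:
        if field_name in node_info:
            val = node_info[field_name]
            results[field_name] = val
            graph.add_node(node_key, **{field_name: val})


graph = nx.DiGraph()
results = {}
update_node(graph, {"numTotalInboundPeers": 3}, "example-node", results,
            ["numTotalInboundPeers"])
assert results["numTotalInboundPeers"] == 3
assert graph.nodes["example-node"]["numTotalInboundPeers"] == 3
```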

scripts/overlay_survey/simulation.py

Lines changed: 19 additions & 6 deletions
```diff
@@ -63,7 +63,6 @@ def _add_v2_survey_data(node_json):
     for peer in node_json["outboundPeers"]:
         peer["averageLatencyMs"] = random.randint(0, 2**32-1)
 
-
 class SurveySimulation:
     """
     Simulates the HTTP endpoints of stellar-core's overlay survey. Raises
@@ -139,11 +138,25 @@ def _surveytopologytimesliced(self, params):
         assert params.keys() == {"node",
                                  "inboundpeerindex",
                                  "outboundpeerindex"}
-        if params["node"] != self._root_node:
-            req = util.PendingRequest(params["node"],
-                                      params["inboundpeerindex"],
-                                      params["outboundpeerindex"])
-            self._pending_requests.append(req)
+
+        fail_response = SimulatedResponse(
+            {"exception" :
+             util.SURVEY_TOPOLOGY_TIME_SLICED_ALREADY_IN_BACKLOG_OR_SELF})
+        node = params["node"]
+        inbound_peer_idx = params["inboundpeerindex"]
+        outbound_peer_idx = params["outboundpeerindex"]
+        if node == self._root_node:
+            # Nodes cannot survey themselves (yet)
+            return fail_response
+
+        if ((inbound_peer_idx > 0 or outbound_peer_idx > 0) and
+                random.random() < 0.2):
+            # Randomly indicate that node is already in backlog if it is being
+            # resurveyed. Script should handle this by trying again later.
+            return fail_response
+
+        req = util.PendingRequest(node, inbound_peer_idx, outbound_peer_idx)
+        self._pending_requests.append(req)
         return SimulatedResponse(
             text=util.SURVEY_TOPOLOGY_TIME_SLICED_SUCCESS_TEXT)
 
```
scripts/overlay_survey/util.py

Lines changed: 10 additions & 1 deletion
```diff
@@ -25,4 +25,13 @@
 # "Adding node."
 SURVEY_TOPOLOGY_TIME_SLICED_SUCCESS_START = "Adding node."
 SURVEY_TOPOLOGY_TIME_SLICED_SUCCESS_TEXT = \
-    SURVEY_TOPOLOGY_TIME_SLICED_SUCCESS_START + "Survey already running!"
+    SURVEY_TOPOLOGY_TIME_SLICED_SUCCESS_START + "Survey already running!"
+
+# The error response from the surveytopologytimesliced endpoint when the survey
+# backlog already contains the node requested to be surveyed, or the requested
+# node is the surveyor. stellar-core returns this error JSON object where the
+# error text is contained in the "exception" field.
+SURVEY_TOPOLOGY_TIME_SLICED_ALREADY_IN_BACKLOG_OR_SELF = (
+    "addPeerToBacklog failed: Peer is already in the backlog, or peer "
+    "is self."
+)
```
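A simplified sketch of how a client tells these two responses apart (it mirrors `send_survey_requests` in `scripts/OverlaySurvey.py` above, but the `startswith` check and the import path are assumptions on my part; `response` is any `requests.Response`-like object):

```python
import requests

from overlay_survey import util  # assumed import path


def classify_response(response):
    """Classify a surveytopologytimesliced response (illustrative only)."""
    if response.text.startswith(
            util.SURVEY_TOPOLOGY_TIME_SLICED_SUCCESS_START):
        return "accepted"
    try:
        exception = response.json()["exception"]
    except (requests.exceptions.JSONDecodeError, KeyError):
        return "error"
    if exception == \
            util.SURVEY_TOPOLOGY_TIME_SLICED_ALREADY_IN_BACKLOG_OR_SELF:
        # Expected condition; the script retries the node in a later round.
        return "already-in-backlog-or-self"
    return "error"
```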
