
Commit d2b47e8
1 parent 0471367

Add clearer retry logic with logging [RHELDST-9679]

Poor retry logic in this library has caused some pub tasks to fail, and the logs do not make it clear whether retries are happening at all. This change extends the urllib3 retry logic with logging to address both issues. It also introduces new environment variables to make the retry periods configurable. The default backoff settings retry the request 10 times over a 5 minute period.
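
A note on using the new settings (illustrative, not part of the commit): both environment variables are read at import time, when the FastPurgeClient class body runs, so they have to be exported before fastpurge is imported. The auth values and the purge call below are placeholders in the style of the project README.

    # Hedged usage sketch for the new retry knobs; credential values are placeholders.
    import os

    # Must be set before `fastpurge` is imported: the class attributes read them
    # via os.environ.get() when the module loads.
    os.environ["FAST_PURGE_MAX_RETRIES"] = "5"      # default: 10
    os.environ["FAST_PURGE_RETRY_BACKOFF"] = "0.3"  # default: 0.15

    from fastpurge import FastPurgeClient

    client = FastPurgeClient(auth={
        "client_secret": "...",  # Akamai EdgeGrid credentials (placeholders)
        "host": "akaa-xxxxxxxx.purge.akamaiapis.net",
        "access_token": "...",
        "client_token": "...",
    })
    future = client.purge_by_cpcode([1234, 5678])  # returns a Future, as in the tests below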

File tree (3 files changed, +94 -23 lines):

  fastpurge/_client.py
  test-requirements.txt
  tests/test_purge.py

fastpurge/_client.py  +61 -14

@@ -6,6 +6,10 @@
 from threading import local, Lock
 
 import requests
+from requests.adapters import HTTPAdapter
+from requests.exceptions import RetryError
+from urllib3.util import Retry
+from http import HTTPStatus
 
 try:
     from time import monotonic
@@ -32,6 +36,29 @@
 ])
 
 
+class LoggingRetry(Retry):
+    def __init__(self, *args, **kwargs):
+        self._logger = kwargs.pop('logger', None)
+        super(LoggingRetry, self).__init__(*args, **kwargs)
+
+    def new(self, **kw):
+        kw['logger'] = self._logger
+        return super(LoggingRetry, self).new(**kw)
+
+    def increment(self, method, url, *args, **kwargs):
+        response = kwargs.get("response")
+        if response:
+            self._logger.error("An invalid status code %s was received "
+                               "when trying to %s to %s: %s",
+                               response.status, method, url, response.reason)
+        else:  # pragma: no cover
+            self._logger.error(
+                "An unknown error occurred when trying to %s to %s", method,
+                url)
+        return super(LoggingRetry, self).increment(method, url, *args,
+                                                   **kwargs)
+
+
 class FastPurgeError(RuntimeError):
     """Raised when the Fast Purge API reports an error.
 
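To make the new logging concrete (illustrative only, not captured from a real run): increment() logs with plain %-style formatting, so a retried 503 would be reported roughly as follows, with status, method, URL and reason taken from the failed attempt.

    # Illustrative rendering of the message logged by LoggingRetry.increment();
    # the status, URL and reason values are placeholders.
    msg = ("An invalid status code %s was received "
           "when trying to %s to %s: %s")
    print(msg % (503, "POST", "/ccu/v3/delete/cpcode/production", "Service Unavailable"))
    # -> An invalid status code 503 was received when trying to POST to
    #    /ccu/v3/delete/cpcode/production: Service Unavailable
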
@@ -74,6 +101,11 @@ class FastPurgeClient(object):
     # Default network matches Akamai's documented default
     DEFAULT_NETWORK = os.environ.get("FAST_PURGE_DEFAULT_NETWORK", "production")
 
+    # Max number of retries allowed for HTTP requests, and the backoff used
+    # to extend the delay between requests.
+    MAX_RETRIES = int(os.environ.get("FAST_PURGE_MAX_RETRIES", "10"))
+
+    RETRY_BACKOFF = float(os.environ.get("FAST_PURGE_RETRY_BACKOFF", "0.15"))
     # Default purge type.
     # Akamai recommend "invalidate", so why is "delete" our default?
     # Here's what Akamai docs have to say:
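
As a rough guide to how these two settings interact (not part of the commit): urllib3 documents the sleep between attempts as backoff_factor * 2 ** (retry - 1), capped at 120 seconds by default, with the exact handling of the first retry differing between urllib3 versions. Under that formula the defaults above sleep for roughly 2.5 minutes across 10 retries; the remainder of the five-minute window quoted in the commit message is presumably spent in the HTTP attempts themselves.

    # Back-of-envelope sketch of the sleep schedule implied by the defaults,
    # assuming urllib3's documented backoff formula; real elapsed time also
    # includes each HTTP attempt, and first-retry handling varies by version.
    BACKOFF_FACTOR = 0.15   # FAST_PURGE_RETRY_BACKOFF default
    MAX_RETRIES = 10        # FAST_PURGE_MAX_RETRIES default
    BACKOFF_MAX = 120       # urllib3's default cap on a single sleep

    delays = [0.0 if n == 1 else min(BACKOFF_MAX, BACKOFF_FACTOR * 2 ** (n - 1))
              for n in range(1, MAX_RETRIES + 1)]
    print(delays)        # [0.0, 0.3, 0.6, 1.2, 2.4, 4.8, 9.6, 19.2, 38.4, 76.8]
    print(sum(delays))   # ~153 seconds of sleeping across 10 retries
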
@@ -197,12 +229,32 @@ def __baseurl(self):
 
         return '{out}:{port}'.format(out=out, port=self.__port)
 
+    @property
+    def __retry_policy(self):
+        retries = getattr(self.__local, 'retries', None)
+        if not retries:
+            retries = LoggingRetry(
+                total=self.MAX_RETRIES,
+                backoff_factor=self.RETRY_BACKOFF,
+                # We strictly require 201 here since that's how the server
+                # tells us we queued something async, as expected
+                status_forcelist=[status.value for status in HTTPStatus
+                                  if status.value != 201],
+                allowed_methods={'POST'},
+                logger=LOG,
+            )
+            self.__local.retries = retries
+        return retries
+
     @property
     def __session(self):
         session = getattr(self.__local, 'session', None)
         if not session:
             session = requests.Session()
             session.auth = EdgeGridAuth(**self.__auth)
+            session.mount(self.__baseurl,
+                          HTTPAdapter(max_retries=self.__retry_policy))
+
             self.__local.session = session
         return session
 
@@ -223,21 +275,16 @@ def __get_request_bodies(self, objects):
     def __start_purge(self, endpoint, request_body):
         headers = {'Content-Type': 'application/json'}
         LOG.debug("POST JSON of size %s to %s", len(request_body), endpoint)
-
-        response = self.__session.post(endpoint, data=request_body, headers=headers)
-
-        # Did it succeed? We strictly require 201 here since that's how the server tells
-        # us we queued something async, as expected
-        if response.status_code != 201:
-            message = "Request to {endpoint} failed: {r.status_code} {r.reason} {text}".\
-                format(endpoint=endpoint, r=response, text=response.text[0:800])
+        try:
+            response = self.__session.post(endpoint, data=request_body, headers=headers)
+            response_body = response.json()
+            estimated_seconds = response_body.get('estimatedSeconds', 5)
+            return Purge(response_body, monotonic() + estimated_seconds)
+        except RetryError as e:
+            message = "Request to {endpoint} was unsuccessful after {retries} retries: {reason}". \
+                format(endpoint=endpoint, retries=self.MAX_RETRIES, reason=e.args[0].reason)
             LOG.debug("%s", message)
-            raise FastPurgeError(message)
-
-        response_body = response.json()
-        estimated_seconds = response_body.get('estimatedSeconds', 5)
-
-        return Purge(response_body, monotonic() + estimated_seconds)
+            raise FastPurgeError(message) from e
 
     def purge_objects(self, object_type, objects, **kwargs):
         """Purge a collection of objects.

test-requirements.txt  +1

@@ -2,3 +2,4 @@ pytest
 requests-mock
 mock
 bandit==1.7.5;python_version > '3'
+responses

tests/test_purge.py  +32 -9

@@ -1,6 +1,7 @@
 import pytest
 import requests_mock
 import mock
+import responses
 
 try:
     from time import monotonic
@@ -37,7 +38,7 @@ def requests_mocker():
 
 
 @pytest.fixture
-def no_retries():
+def no_thread_retries():
     """Suppress retries for the duration of this fixture."""
 
     with mock.patch('more_executors.retry.ExceptionRetryPolicy') as policy_class:
@@ -131,20 +132,20 @@ def test_scheme_port(client_auth, requests_mocker):
     assert future.result()
 
 
-def test_response_fails(client, requests_mocker, no_retries):
+@responses.activate
+def test_response_fails(client, no_thread_retries, monkeypatch):
     """Requests fail with a FastPurgeError if API gives unsuccessful response."""
+    url = 'https://fastpurge.example.com/ccu/v3/delete/cpcode/production'
+    # Decrease backoff, otherwise the test will run for 5 minutes
+    monkeypatch.setenv("FAST_PURGE_RETRY_BACKOFF", "0.001")
 
-    requests_mocker.register_uri(
-        method='POST',
-        url='https://fastpurge.example.com/ccu/v3/delete/cpcode/production',
-        status_code=503,
-        reason='simulated internal error')
-
+    responses.add(responses.POST, url, status=503,
+                  content_type="application/json", body="Error")
     future = client.purge_by_cpcode([1234, 5678])
     exception = future.exception()
 
     assert isinstance(exception, FastPurgeError)
-    assert '503 simulated internal error' in str(exception)
+    assert 'too many 503 error responses' in str(exception)
 
 
 def test_split_requests(client, requests_mocker):
@@ -201,3 +202,25 @@ def test_multiple_clients_with_the_same_auth_dict(client_auth):
     client2 = FastPurgeClient(auth=client_auth)
 
     assert client1 is not client2
+
+
+@responses.activate(registry=responses.registries.OrderedRegistry)
+def test_retries_on_error(client_auth):
+    """Sanity check for the retry functionality"""
+    url = 'http://fastpurge.example.com:42/ccu/v3/delete/tag/staging'
+    err_1 = responses.add(responses.POST, url, status=500,
+                          content_type="application/json", body="Error")
+    err_2 = responses.add(responses.POST, url, status=501,
+                          content_type="application/json", body="Error")
+    res = responses.add(responses.POST, url, status=201,
+                        content_type="application/json",
+                        json={'estimatedSeconds': 0.1})
+
+    client = FastPurgeClient(auth=client_auth, scheme='http', port=42)
+
+    future = client.purge_by_tag(['red'], network='staging')
+
+    assert future.result()
+    assert len(err_1.calls) == 1
+    assert len(err_2.calls) == 1
+    assert len(res.calls) == 1
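
For reference (not part of the commit), responses' OrderedRegistry is what lets this test hand a different canned response to each successive attempt, so the two error stubs and the final 201 are each consumed exactly once. A minimal standalone illustration using only the documented responses API:

    import requests
    import responses
    from responses import registries

    @responses.activate(registry=registries.OrderedRegistry)
    def demo():
        # Stubs are served strictly in registration order.
        responses.add(responses.GET, "http://example.test/x", status=503)
        responses.add(responses.GET, "http://example.test/x", status=200, json={"ok": True})
        assert requests.get("http://example.test/x").status_code == 503
        assert requests.get("http://example.test/x").status_code == 200

    demo()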
