
Added ability to use LF, not only CRLF delimiter for response Headers and Body #115

Merged
1 change: 0 additions & 1 deletion h11/_connection.py
@@ -425,7 +425,6 @@ def next_event(self):
event = self._extract_next_receive_event()
if event not in [NEED_DATA, PAUSED]:
self._process_event(self.their_role, event)
self._receive_buffer.compress()
if event is NEED_DATA:
if len(self._receive_buffer) > self._max_incomplete_event_size:
# 431 is "Request header fields too large" which is pretty
2 changes: 1 addition & 1 deletion h11/_readers.py
@@ -153,7 +153,7 @@ def __call__(self, buf):
assert self._bytes_to_discard == 0
if self._bytes_in_chunk == 0:
# We need to refill our chunk count
chunk_header = buf.maybe_extract_until_next(b"\r\n")
chunk_header = buf.maybe_extract_next_line()
if chunk_header is None:
return None
matches = validate(
126 changes: 77 additions & 49 deletions h11/_receivebuffer.py
@@ -1,10 +1,12 @@
import re
import sys

__all__ = ["ReceiveBuffer"]


# Operations we want to support:
# - find next \r\n or \r\n\r\n, or wait until there is one
# - find next \r\n or \r\n\r\n (\n or \n\n are also acceptable),
# or wait until there is one
# - read at-most-N bytes
# Goals:
# - on average, do this fast
@@ -38,75 +40,101 @@
# slightly clever thing where we delay calling compress() until we've
# processed a whole event, which could in theory be slightly more efficient
# than the internal bytearray support.)

blank_line_regex = re.compile(b"\n\r?\n", re.MULTILINE)


class ReceiveBuffer(object):
def __init__(self):
self._data = bytearray()
# These are both absolute offsets into self._data:
self._start = 0
self._looked_at = 0
self._looked_for = b""
self._next_line_search = 0
self._multiple_lines_search = 0

def __iadd__(self, byteslike):
self._data += byteslike
return self

def __bool__(self):
return bool(len(self))

def __len__(self):
return len(self._data)

# for @property unprocessed_data
def __bytes__(self):
return bytes(self._data[self._start :])
return bytes(self._data)

if sys.version_info[0] < 3: # version specific: Python 2
__str__ = __bytes__
__nonzero__ = __bool__

def __len__(self):
return len(self._data) - self._start

def compress(self):
# Heuristic: only compress if it lets us reduce size by a factor
# of 2
if self._start > len(self._data) // 2:
del self._data[: self._start]
self._looked_at -= self._start
self._start -= self._start

def __iadd__(self, byteslike):
self._data += byteslike
return self

def maybe_extract_at_most(self, count):
out = self._data[self._start : self._start + count]
"""
Extract a fixed number of bytes from the buffer.
"""
out = self._data[:count]
if not out:
return None
self._start += len(out)

self._data[:count] = b""
self._next_line_search = 0
self._multiple_lines_search = 0
Member

There are a lot of copy/pastes of this code for extracting an initial slice and then doing the internal bookkeeping, which is error-prone. Let's factor it out into a method, like:

def _extract(self, count):
    out = self._data[:count]
    del self._data[:count]
    self._next_line_search = 0
    self._multiple_lines_search = 0
    return out

And then in all the other methods, just do:

    return self._extract(whatever_length_value_we_ended_up_with)

Contributor Author

I introduced _extract and replaced all of the copy/pastes with this function.
It also helped resolve the comment above (#115 (comment)).
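
For reference, a minimal sketch of the consolidated helper and one rewritten caller (an assumed final shape based on the suggestion above; the merged code may differ in detail):

def _extract(self, count):
    # Slice off the first `count` bytes and reset the cached search
    # offsets, since the remaining data now starts at position 0.
    out = self._data[:count]
    del self._data[:count]
    self._next_line_search = 0
    self._multiple_lines_search = 0
    return out

def maybe_extract_at_most(self, count):
    """
    Extract a fixed number of bytes from the buffer.
    """
    out = self._data[:count]
    if not out:
        return None
    return self._extract(count)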

return out

def maybe_extract_until_next(self, needle):
# Returns extracted bytes on success (advancing offset), or None on
# failure
if self._looked_for == needle:
search_start = max(self._start, self._looked_at - len(needle) + 1)
else:
search_start = self._start
offset = self._data.find(needle, search_start)
if offset == -1:
self._looked_at = len(self._data)
self._looked_for = needle
def maybe_extract_next_line(self):
"""
Extract the first line, if it is completed in the buffer.
"""
# Only search in buffer space that we've not already looked at.
search_start_index = max(0, self._next_line_search - 1)
partial_buffer = self._data[search_start_index:]
partial_idx = partial_buffer.find(b"\r\n")
if partial_idx == -1:
self._next_line_search = len(self._data)
Member

In maybe_extract_next_line, we store the raw buffer length in self._next_line_search and then do the subtraction when we use it. In maybe_extract_lines, we store the "pre-subtracted" value, so we can use it directly. This inconsistency is kind of confusing :-). We should switch one of them so they match.

Contributor Author

Once I rewrote both methods to use the _extract(...) method, this issue resolved itself (both methods now work with offsets).
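
As an illustration, here is a sketch of maybe_extract_next_line once it delegates to _extract (an assumed shape; the saved search position is then a plain offset into self._data, matching maybe_extract_lines):

def maybe_extract_next_line(self):
    """
    Extract the first line, if it is completed in the buffer.
    """
    # Resume one byte back, in case the data seen so far ends in the
    # middle of a b"\r\n" delimiter.
    search_start_index = max(0, self._next_line_search - 1)
    partial_idx = self._data.find(b"\r\n", search_start_index)
    if partial_idx == -1:
        self._next_line_search = len(self._data)
        return None
    # + 2 to include the b"\r\n" delimiter in the extracted line.
    return self._extract(partial_idx + 2)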

return None
new_start = offset + len(needle)
out = self._data[self._start : new_start]
self._start = new_start

# Truncate the buffer and return it.
# + 2 is to compensate for len(b"\r\n")
idx = search_start_index + partial_idx + 2
out = self._data[:idx]
self._data[:idx] = b""
self._next_line_search = 0
self._multiple_lines_search = 0
return out

# HTTP/1.1 has a number of constructs where you keep reading lines until
# you see a blank one. This does that, and then returns the lines.
def maybe_extract_lines(self):
if self._data[self._start : self._start + 2] == b"\r\n":
self._start += 2
"""
Extract everything up to the first blank line, and return a list of lines.
"""
# Handle the case where we have an immediate empty line.
if self._data[:1] == b"\n":
self._data[:1] = b""
self._next_line_search = 0
self._multiple_lines_search = 0
return []

if self._data[:2] == b"\r\n":
self._data[:2] = b""
self._next_line_search = 0
self._multiple_lines_search = 0
return []
else:
data = self.maybe_extract_until_next(b"\r\n\r\n")
if data is None:
return None
lines = data.split(b"\r\n")
assert lines[-2] == lines[-1] == b""
del lines[-2:]
return lines

# Only search in buffer space that we've not already looked at.
partial_buffer = self._data[self._multiple_lines_search :]
match = blank_line_regex.search(partial_buffer)
Member

Instead of copying the buffer, use the pos argument to search:

match = blank_line_regex.search(self._data, self._multiple_lines_search)
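
For context, a small self-contained demonstration of the pos argument (standard re behavior: a compiled pattern's .search(string, pos) starts matching at offset pos without building a slice, so the bytearray is never copied; the buffer contents here are made up):

import re

blank_line_regex = re.compile(b"\n\r?\n", re.MULTILINE)

buf = bytearray(b"HTTP/1.1 200 OK\r\nConnection: close\r\n\r\nbody")
already_searched = 10  # an offset we already know contains no blank line

# Searches buf starting at offset 10; buf[10:] is never materialized.
match = blank_line_regex.search(buf, already_searched)
if match is not None:
    print(match.span())  # indices are absolute into buf, not relative to pos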

if match is None:
self._multiple_lines_search = max(0, len(self._data) - 2)
return None

# Truncate the buffer and return it.
idx = self._multiple_lines_search + match.span(0)[-1]
out = self._data[:idx]
lines = [line.rstrip(b"\r") for line in out.split(b"\n")]
Member

Instead of calling rstrip here (which always has to copy the whole buffer, since these are bytearrays), I think we could leave the trailing \r in, and then in _abnf.py change header_field, request_line, and status_line to match a trailing optional \r, e.g.:

header_field = (
    r"(?P<field_name>{field_name})"
    r":"
    r"{OWS}"
    r"(?P<field_value>{field_value})"
    r"{OWS}\r?".format(**globals())         # <-- notice added \r? here
)

Contributor Author (@cdeler, Dec 11, 2020)

I tried to do that.

I added \r? to these regexes (header_field, request_line, and status_line).

Then I set

lines = out.split(b"\n")

but it broke one of the test_readers_unusual test cases in test_io.py.

The test source:
def test_readers_unusual():
    ...

    # obsolete line folding
    tr(
        READERS[CLIENT, IDLE],
        b"HEAD /foo HTTP/1.1\r\n"
        b"Host: example.com\r\n"
        b"Some: multi-line\r\n"
        b" header\r\n"
        b"\tnonsense\r\n"
        b"    \t   \t\tI guess\r\n"
        b"Connection: close\r\n"
        b"More-nonsense: in the\r\n"
        b"    last header  \r\n\r\n",
        Request(
            method="HEAD",
            target="/foo",
            headers=[
                ("Host", "example.com"),
                ("Some", "multi-line header nonsense I guess"),
                ("Connection", "close"),
                ("More-nonsense", "in the last header"),
            ],
        ),
    )

The header

        b"Some: multi-line\r\n"
        b" header\r\n"
        b"\tnonsense\r\n"
        b"    \t   \t\tI guess\r\n"

turns into

        b"Some: multi-line\r header\r\tnonsense\r    \t   \t\tI guess\r\n"

I cannot figure out how to carefully cut out the \r from such a line.
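
To make the failure concrete, a hypothetical minimal reproduction (not code from the PR) of what splitting on b"\n" alone does to that folded header:

raw = bytearray(
    b"Some: multi-line\r\n"
    b" header\r\n"
    b"\tnonsense\r\n"
    b"    \t   \t\tI guess\r\n"
)
# Splitting on b"\n" alone leaves the b"\r" at the end of every line:
lines = raw.split(b"\n")
print(lines[0])  # bytearray(b'Some: multi-line\r')

# Joining the obs-fold continuations then buries those b"\r" bytes in the
# middle of the header value, where a trailing \r? in a regex cannot reach:
print(bytearray(b"").join(lines[:-1]))
# bytearray(b'Some: multi-line\r header\r\tnonsense\r    \t   \t\tI guess\r')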

Contributor Author

After some discussion on Gitter, it was decided not to change the regexes, but to rewrite the .rstrip(...) call as

        for line in lines:
            if line.endswith(b"\r"):
                del line[-1]

to avoid the extra memory allocation.
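
The in-place deletion works because bytearray.split() returns mutable bytearray objects; a quick sketch (sample data made up):

out = bytearray(b"Host: example.com\r\nConnection: close\r\n")
lines = out.split(b"\n")
for line in lines:
    if line.endswith(b"\r"):
        del line[-1]  # mutates the bytearray in place; no new object is allocated
print(lines)
# [bytearray(b'Host: example.com'), bytearray(b'Connection: close'), bytearray(b'')]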


self._data[:idx] = b""
self._next_line_search = 0
self._multiple_lines_search = 0

assert lines[-2] == lines[-1] == b""

return lines[:-2]
92 changes: 74 additions & 18 deletions h11/tests/test_receivebuffer.py
@@ -1,3 +1,7 @@
import re

import pytest

from .._receivebuffer import ReceiveBuffer


@@ -12,15 +16,13 @@ def test_receivebuffer():
assert len(b) == 3
assert bytes(b) == b"123"

b.compress()
assert bytes(b) == b"123"

assert b.maybe_extract_at_most(2) == b"12"
assert b
assert len(b) == 1
assert bytes(b) == b"3"

b.compress()
assert bytes(b) == b"3"

assert b.maybe_extract_at_most(10) == b"3"
@@ -33,32 +35,35 @@
# maybe_extract_until_next
################################################################

b += b"12345a6789aa"
b += b"123\n456\r\n789\r\n"

assert b.maybe_extract_until_next(b"a") == b"12345a"
assert bytes(b) == b"6789aa"
assert b.maybe_extract_next_line() == b"123\n456\r\n"
assert bytes(b) == b"789\r\n"

assert b.maybe_extract_until_next(b"aaa") is None
assert bytes(b) == b"6789aa"
assert b.maybe_extract_next_line() == b"789\r\n"
assert bytes(b) == b""

b += b"a12"
assert b.maybe_extract_until_next(b"aaa") == b"6789aaa"
assert bytes(b) == b"12"
b += b"12\r"
assert b.maybe_extract_next_line() is None
assert bytes(b) == b"12\r"

# check repeated searches for the same needle, triggering the
# pickup-where-we-left-off logic
b += b"345"
assert b.maybe_extract_until_next(b"aaa") is None
b += b"345\n\r"
assert b.maybe_extract_next_line() is None
assert bytes(b) == b"12\r345\n\r"

b += b"6789aaa123"
assert b.maybe_extract_until_next(b"aaa") == b"123456789aaa"
assert bytes(b) == b"123"
# here we stopped in the middle of the b"\r\n" delimiter

b += b"\n6789aaa123\r\n"
assert b.maybe_extract_next_line() == b"12\r345\n\r\n"
assert b.maybe_extract_next_line() == b"6789aaa123\r\n"
assert b.maybe_extract_next_line() is None
assert bytes(b) == b""

################################################################
# maybe_extract_lines
################################################################

b += b"\r\na: b\r\nfoo:bar\r\n\r\ntrailing"
b += b"123\r\na: b\r\nfoo:bar\r\n\r\ntrailing"
lines = b.maybe_extract_lines()
assert lines == [b"123", b"a: b", b"foo:bar"]
assert bytes(b) == b"trailing"
@@ -76,3 +81,54 @@ def test_receivebuffer():
b += b"\r\ntrailing"
assert b.maybe_extract_lines() == []
assert bytes(b) == b"trailing"


@pytest.mark.parametrize(
"data",
[
pytest.param(
(
b"HTTP/1.1 200 OK\r\n",
b"Content-type: text/plain\r\n",
b"Connection: close\r\n",
b"\r\n",
b"Some body",
),
id="with_crlf_delimiter",
),
pytest.param(
(
b"HTTP/1.1 200 OK\n",
b"Content-type: text/plain\n",
b"Connection: close\n",
b"\n",
b"Some body",
),
id="with_lf_only_delimiter",
),
pytest.param(
(
b"HTTP/1.1 200 OK\n",
b"Content-type: text/plain\r\n",
b"Connection: close\n",
b"\n",
b"Some body",
),
id="with_mixed_crlf_and_lf",
),
],
)
def test_receivebuffer_for_invalid_delimiter(data):
b = ReceiveBuffer()

for line in data:
b += line

lines = b.maybe_extract_lines()

assert lines == [
b"HTTP/1.1 200 OK",
b"Content-type: text/plain",
b"Connection: close",
]
assert bytes(b) == b"Some body"
Member

Can you also add a few similar tests to test_readers_unusual in test_io.py? These tests are good for the basic ReceiveBuffer functionality, but the harness there runs our full HTTP parsing pipeline end-to-end, so it would give us more confidence that everything is wired up correctly.

Contributor Author

I added similar tests to test_readers_unusual.
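
For illustration, one such end-to-end case might look like this, reusing the tr(...) harness and names from the test quoted earlier (a hypothetical sketch; the tests actually added in the PR may differ):

# An LF-only request, run through the full parsing pipeline:
tr(
    READERS[CLIENT, IDLE],
    b"GET /foo HTTP/1.1\n"
    b"Host: example.com\n"
    b"Connection: close\n"
    b"\n",
    Request(
        method="GET",
        target="/foo",
        headers=[("Host", "example.com"), ("Connection", "close")],
    ),
)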