
Added ability to use LF, not only CRLF delimiter for response Headers and Body #115

Merged
1 change: 0 additions & 1 deletion h11/_connection.py
@@ -425,7 +425,6 @@ def next_event(self):
event = self._extract_next_receive_event()
if event not in [NEED_DATA, PAUSED]:
self._process_event(self.their_role, event)
self._receive_buffer.compress()
if event is NEED_DATA:
if len(self._receive_buffer) > self._max_incomplete_event_size:
# 431 is "Request header fields too large" which is pretty
2 changes: 1 addition & 1 deletion h11/_readers.py
@@ -153,7 +153,7 @@ def __call__(self, buf):
assert self._bytes_to_discard == 0
if self._bytes_in_chunk == 0:
# We need to refill our chunk count
chunk_header = buf.maybe_extract_until_next(b"\r\n")
chunk_header = buf.maybe_extract_next_line()
if chunk_header is None:
return None
matches = validate(
126 changes: 77 additions & 49 deletions h11/_receivebuffer.py
@@ -1,10 +1,12 @@
import re
import sys

__all__ = ["ReceiveBuffer"]


# Operations we want to support:
# - find next \r\n or \r\n\r\n, or wait until there is one
# - find next \r\n or \r\n\r\n (\n or \n\n are also acceptable),
# or wait until there is one
# - read at-most-N bytes
# Goals:
# - on average, do this fast
@@ -38,75 +40,101 @@
# slightly clever thing where we delay calling compress() until we've
# processed a whole event, which could in theory be slightly more efficient
# than the internal bytearray support.)

blank_line_regex = re.compile(b"\n\r?\n", re.MULTILINE)


class ReceiveBuffer(object):
def __init__(self):
self._data = bytearray()
# These are both absolute offsets into self._data:
self._start = 0
self._looked_at = 0
self._looked_for = b""
self._next_line_search = 0
self._multiple_lines_search = 0

def __iadd__(self, byteslike):
self._data += byteslike
return self

def __bool__(self):
return bool(len(self))

def __len__(self):
return len(self._data)

# for @property unprocessed_data
def __bytes__(self):
return bytes(self._data[self._start :])
return bytes(self._data)

if sys.version_info[0] < 3: # version specific: Python 2
__str__ = __bytes__
__nonzero__ = __bool__

def __len__(self):
return len(self._data) - self._start

def compress(self):
# Heuristic: only compress if it lets us reduce size by a factor
# of 2
if self._start > len(self._data) // 2:
del self._data[: self._start]
self._looked_at -= self._start
self._start -= self._start

def __iadd__(self, byteslike):
self._data += byteslike
return self

def maybe_extract_at_most(self, count):
out = self._data[self._start : self._start + count]
"""
Extract a fixed number of bytes from the buffer.
"""
out = self._data[:count]
if not out:
return None
self._start += len(out)

self._data[:count] = b""
self._next_line_search = 0
self._multiple_lines_search = 0
Member

There are a lot of copy/pastes of this code for extracting an initial slice and then doing the internal bookkeeping, which is error-prone. Let's factor it out into a method, like:

def _extract(self, count):
    out = self._data[:count]
    del self._data[:count]
    self._next_line_search = 0
    self._multiple_lines_search = 0
    return out

And then in all the other methods, just do:

    return self._extract(whatever_length_value_we_ended_up_with)

Contributor Author

I introduced _extract and replaced all of the copy/pastes with this function.
It also helped resolve the comment above (#115 (comment)).
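
For reference, a minimal sketch of the consolidated helper and one rewritten caller (an assumed final shape based on the suggestion above; the merged code may differ in detail):

def _extract(self, count):
    # Slice off the first `count` bytes and reset the cached search
    # offsets, since the remaining data now starts at position 0.
    out = self._data[:count]
    del self._data[:count]
    self._next_line_search = 0
    self._multiple_lines_search = 0
    return out

def maybe_extract_at_most(self, count):
    """
    Extract a fixed number of bytes from the buffer.
    """
    out = self._data[:count]
    if not out:
        return None
    return self._extract(count)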

return out

def maybe_extract_until_next(self, needle):
# Returns extracted bytes on success (advancing offset), or None on
# failure
if self._looked_for == needle:
search_start = max(self._start, self._looked_at - len(needle) + 1)
else:
search_start = self._start
offset = self._data.find(needle, search_start)
if offset == -1:
self._looked_at = len(self._data)
self._looked_for = needle
def maybe_extract_next_line(self):
"""
Extract the first line, if it is completed in the buffer.
"""
# Only search in buffer space that we've not already looked at.
search_start_index = max(0, self._next_line_search - 1)
partial_buffer = self._data[search_start_index:]
partial_idx = partial_buffer.find(b"\r\n")
if partial_idx == -1:
self._next_line_search = len(self._data)
Member

In maybe_extract_next_line, we store the raw buffer length in self._next_line_search and then do the subtraction when we use it. In maybe_extract_lines, we store the "pre-subtracted" value, so we can use it directly. This inconsistency is kind of confusing :-). We should switch one of them so they match.

Contributor Author

Once I rewrote both methods to use the _extract(...) method, this issue resolved itself (both methods now work with offsets).
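
As an illustration, here is a sketch of maybe_extract_next_line once it delegates to _extract (an assumed shape; the saved search position is then a plain offset into self._data, matching maybe_extract_lines):

def maybe_extract_next_line(self):
    """
    Extract the first line, if it is completed in the buffer.
    """
    # Resume one byte back, in case the data seen so far ends in the
    # middle of a b"\r\n" delimiter.
    search_start_index = max(0, self._next_line_search - 1)
    partial_idx = self._data.find(b"\r\n", search_start_index)
    if partial_idx == -1:
        self._next_line_search = len(self._data)
        return None
    # + 2 to include the b"\r\n" delimiter in the extracted line.
    return self._extract(partial_idx + 2)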

return None
new_start = offset + len(needle)
out = self._data[self._start : new_start]
self._start = new_start

# Truncate the buffer and return it.
# + 2 is to compensate for len(b"\r\n")
idx = search_start_index + partial_idx + 2
out = self._data[:idx]
self._data[:idx] = b""
self._next_line_search = 0
self._multiple_lines_search = 0
return out

# HTTP/1.1 has a number of constructs where you keep reading lines until
# you see a blank one. This does that, and then returns the lines.
def maybe_extract_lines(self):
if self._data[self._start : self._start + 2] == b"\r\n":
self._start += 2
"""
Extract everything up to the first blank line, and return a list of lines.
"""
# Handle the case where we have an immediate empty line.
if self._data[:1] == b"\n":
self._data[:1] = b""
self._next_line_search = 0
self._multiple_lines_search = 0
return []

if self._data[:2] == b"\r\n":
self._data[:2] = b""
self._next_line_search = 0
self._multiple_lines_search = 0
return []
else:
data = self.maybe_extract_until_next(b"\r\n\r\n")
if data is None:
return None
lines = data.split(b"\r\n")
assert lines[-2] == lines[-1] == b""
del lines[-2:]
return lines

# Only search in buffer space that we've not already looked at.
partial_buffer = self._data[self._multiple_lines_search :]
match = blank_line_regex.search(partial_buffer)
Member

Instead of copying the buffer, use the pos argument to search:

match = blank_line_regex.search(self._data, self._multiple_lines_search)
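
For context, a small self-contained demonstration of the pos argument (standard re behavior: a compiled pattern's .search(string, pos) starts matching at offset pos without building a slice, so the bytearray is never copied; the buffer contents here are made up):

import re

blank_line_regex = re.compile(b"\n\r?\n", re.MULTILINE)

buf = bytearray(b"HTTP/1.1 200 OK\r\nConnection: close\r\n\r\nbody")
already_searched = 10  # an offset we already know contains no blank line

# Searches buf starting at offset 10; buf[10:] is never materialized.
match = blank_line_regex.search(buf, already_searched)
if match is not None:
    print(match.span())  # indices are absolute into buf, not relative to pos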

if match is None:
self._multiple_lines_search = max(0, len(self._data) - 2)
return None

# Truncate the buffer and return it.
idx = self._multiple_lines_search + match.span(0)[-1]
out = self._data[:idx]
lines = [line.rstrip(b"\r") for line in out.split(b"\n")]
Member

Instead of calling rstrip here (which always has to copy the whole buffer, since these are bytearrays), I think we could leave the trailing \r in, and then in _abnf.py change header_field, request_line, and status_line to match a trailing optional \r, e.g.:

header_field = (
    r"(?P<field_name>{field_name})"
    r":"
    r"{OWS}"
    r"(?P<field_value>{field_value})"
    r"{OWS}\r?".format(**globals())         # <-- notice added \r? here
)

Contributor Author (@cdeler, Dec 11, 2020)

I tried to do that.

I added \r? to these regexes (header_field, request_line, and status_line).

Then I set

lines = out.split(b"\n")

but it broke one of the test_readers_unusual test cases in test_io.py.

The test source:
def test_readers_unusual():
    ...

    # obsolete line folding
    tr(
        READERS[CLIENT, IDLE],
        b"HEAD /foo HTTP/1.1\r\n"
        b"Host: example.com\r\n"
        b"Some: multi-line\r\n"
        b" header\r\n"
        b"\tnonsense\r\n"
        b"    \t   \t\tI guess\r\n"
        b"Connection: close\r\n"
        b"More-nonsense: in the\r\n"
        b"    last header  \r\n\r\n",
        Request(
            method="HEAD",
            target="/foo",
            headers=[
                ("Host", "example.com"),
                ("Some", "multi-line header nonsense I guess"),
                ("Connection", "close"),
                ("More-nonsense", "in the last header"),
            ],
        ),
    )

The header

        b"Some: multi-line\r\n"
        b" header\r\n"
        b"\tnonsense\r\n"
        b"    \t   \t\tI guess\r\n"

turns into

        b"Some: multi-line\r header\r\tnonsense\r    \t   \t\tI guess\r\n"

I cannot figure out how to carefully cut out the \r from such a line.
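
To make the failure concrete, a hypothetical minimal reproduction (not code from the PR) of what splitting on b"\n" alone does to that folded header:

raw = bytearray(
    b"Some: multi-line\r\n"
    b" header\r\n"
    b"\tnonsense\r\n"
    b"    \t   \t\tI guess\r\n"
)
# Splitting on b"\n" alone leaves the b"\r" at the end of every line:
lines = raw.split(b"\n")
print(lines[0])  # bytearray(b'Some: multi-line\r')

# Joining the obs-fold continuations then buries those b"\r" bytes in the
# middle of the header value, where a trailing \r? in a regex cannot reach:
print(bytearray(b"").join(lines[:-1]))
# bytearray(b'Some: multi-line\r header\r\tnonsense\r    \t   \t\tI guess\r')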

Contributor Author

After some discussion on Gitter, it was decided not to change the regexes, but to rewrite the .rstrip(...) call as

        for line in lines:
            if line.endswith(b"\r"):
                del line[-1]

to avoid the extra memory allocation.
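
The in-place deletion works because bytearray.split() returns mutable bytearray objects; a quick sketch (sample data made up):

out = bytearray(b"Host: example.com\r\nConnection: close\r\n")
lines = out.split(b"\n")
for line in lines:
    if line.endswith(b"\r"):
        del line[-1]  # mutates the bytearray in place; no new object is allocated
print(lines)
# [bytearray(b'Host: example.com'), bytearray(b'Connection: close'), bytearray(b'')]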


self._data[:idx] = b""
self._next_line_search = 0
self._multiple_lines_search = 0

assert lines[-2] == lines[-1] == b""

return lines[:-2]
92 changes: 74 additions & 18 deletions h11/tests/test_receivebuffer.py
@@ -1,3 +1,7 @@
import re

import pytest

from .._receivebuffer import ReceiveBuffer


@@ -12,15 +16,13 @@ def test_receivebuffer():
assert len(b) == 3
assert bytes(b) == b"123"

b.compress()
assert bytes(b) == b"123"

assert b.maybe_extract_at_most(2) == b"12"
assert b
assert len(b) == 1
assert bytes(b) == b"3"

b.compress()
assert bytes(b) == b"3"

assert b.maybe_extract_at_most(10) == b"3"
@@ -33,32 +35,35 @@
# maybe_extract_until_next
################################################################

b += b"12345a6789aa"
b += b"123\n456\r\n789\r\n"

assert b.maybe_extract_until_next(b"a") == b"12345a"
assert bytes(b) == b"6789aa"
assert b.maybe_extract_next_line() == b"123\n456\r\n"
assert bytes(b) == b"789\r\n"

assert b.maybe_extract_until_next(b"aaa") is None
assert bytes(b) == b"6789aa"
assert b.maybe_extract_next_line() == b"789\r\n"
assert bytes(b) == b""

b += b"a12"
assert b.maybe_extract_until_next(b"aaa") == b"6789aaa"
assert bytes(b) == b"12"
b += b"12\r"
assert b.maybe_extract_next_line() is None
assert bytes(b) == b"12\r"

# check repeated searches for the same needle, triggering the
# pickup-where-we-left-off logic
b += b"345"
assert b.maybe_extract_until_next(b"aaa") is None
b += b"345\n\r"
assert b.maybe_extract_next_line() is None
assert bytes(b) == b"12\r345\n\r"

b += b"6789aaa123"
assert b.maybe_extract_until_next(b"aaa") == b"123456789aaa"
assert bytes(b) == b"123"
# here we stopped in the middle of the b"\r\n" delimiter

b += b"\n6789aaa123\r\n"
assert b.maybe_extract_next_line() == b"12\r345\n\r\n"
assert b.maybe_extract_next_line() == b"6789aaa123\r\n"
assert b.maybe_extract_next_line() is None
assert bytes(b) == b""

################################################################
# maybe_extract_lines
################################################################

b += b"\r\na: b\r\nfoo:bar\r\n\r\ntrailing"
b += b"123\r\na: b\r\nfoo:bar\r\n\r\ntrailing"
lines = b.maybe_extract_lines()
assert lines == [b"123", b"a: b", b"foo:bar"]
assert bytes(b) == b"trailing"
@@ -76,3 +81,54 @@ def test_receivebuffer():
b += b"\r\ntrailing"
assert b.maybe_extract_lines() == []
assert bytes(b) == b"trailing"


@pytest.mark.parametrize(
"data",
[
pytest.param(
(
b"HTTP/1.1 200 OK\r\n",
b"Content-type: text/plain\r\n",
b"Connection: close\r\n",
b"\r\n",
b"Some body",
),
id="with_crlf_delimiter",
),
pytest.param(
(
b"HTTP/1.1 200 OK\n",
b"Content-type: text/plain\n",
b"Connection: close\n",
b"\n",
b"Some body",
),
id="with_lf_only_delimiter",
),
pytest.param(
(
b"HTTP/1.1 200 OK\n",
b"Content-type: text/plain\r\n",
b"Connection: close\n",
b"\n",
b"Some body",
),
id="with_mixed_crlf_and_lf",
),
],
)
def test_receivebuffer_for_invalid_delimiter(data):
b = ReceiveBuffer()

for line in data:
b += line

lines = b.maybe_extract_lines()

assert lines == [
b"HTTP/1.1 200 OK",
b"Content-type: text/plain",
b"Connection: close",
]
assert bytes(b) == b"Some body"
Member

Can you also add a few similar tests to test_readers_unusual in test_io.py? These tests are good for the basic ReceiveBuffer functionality, but the harness there runs our full HTTP parsing pipeline end-to-end, so it would give us more confidence that everything is wired up correctly.

Contributor Author

I added similar tests to test_readers_unusual.
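
For illustration, one such end-to-end case might look like this, reusing the tr(...) harness and names from the test quoted earlier (a hypothetical sketch; the tests actually added in the PR may differ):

# An LF-only request, run through the full parsing pipeline:
tr(
    READERS[CLIENT, IDLE],
    b"GET /foo HTTP/1.1\n"
    b"Host: example.com\n"
    b"Connection: close\n"
    b"\n",
    Request(
        method="GET",
        target="/foo",
        headers=[("Host", "example.com"), ("Connection", "close")],
    ),
)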