Skip to content

Added ability to use LF, not only CRLF delimiter for response Headers and Body #115

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
1 change: 0 additions & 1 deletion h11/_connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,7 +425,6 @@ def next_event(self):
event = self._extract_next_receive_event()
if event not in [NEED_DATA, PAUSED]:
self._process_event(self.their_role, event)
self._receive_buffer.compress()
if event is NEED_DATA:
if len(self._receive_buffer) > self._max_incomplete_event_size:
# 431 is "Request header fields too large" which is pretty
Expand Down
2 changes: 1 addition & 1 deletion h11/_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ def __call__(self, buf):
assert self._bytes_to_discard == 0
if self._bytes_in_chunk == 0:
# We need to refill our chunk count
chunk_header = buf.maybe_extract_until_next(b"\r\n")
chunk_header = buf.maybe_extract_next_line()
if chunk_header is None:
return None
matches = validate(
Expand Down
124 changes: 76 additions & 48 deletions h11/_receivebuffer.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import re
import sys

__all__ = ["ReceiveBuffer"]


# Operations we want to support:
# - find next \r\n or \r\n\r\n, or wait until there is one
# - find next \r\n or \r\n\r\n (\n or \n\n are also acceptable),
# or wait until there is one
# - read at-most-N bytes
# Goals:
# - on average, do this fast
Expand Down Expand Up @@ -38,75 +40,101 @@
# slightly clever thing where we delay calling compress() until we've
# processed a whole event, which could in theory be slightly more efficient
# than the internal bytearray support.)

# Matches the blank line that terminates a header block: either b"\n\n"
# or b"\n\r\n" (the trailing LF of one line followed by an empty CRLF
# line), so both CRLF- and bare-LF-delimited headers are recognized.
# The pattern has no ^/$ anchors, so the re.MULTILINE flag would be a
# no-op and is omitted.
blank_line_regex = re.compile(b"\n\r?\n")


class ReceiveBuffer(object):
def __init__(self):
self._data = bytearray()
# These are both absolute offsets into self._data:
self._start = 0
self._looked_at = 0
self._looked_for = b""
self._next_line_search = 0
self._multiple_lines_search = 0

def __iadd__(self, byteslike):
self._data += byteslike
return self

def __bool__(self):
return bool(len(self))

def __len__(self):
return len(self._data)

# for @property unprocessed_data
def __bytes__(self):
return bytes(self._data[self._start :])
return bytes(self._data)

if sys.version_info[0] < 3: # version specific: Python 2
__str__ = __bytes__
__nonzero__ = __bool__

def __len__(self):
return len(self._data) - self._start
def _extract(self, count):
# Extract an initial slice of the data buffer and return it.
out = self._data[:count]
del self._data[:count]

def compress(self):
# Heuristic: only compress if it lets us reduce size by a factor
# of 2
if self._start > len(self._data) // 2:
del self._data[: self._start]
self._looked_at -= self._start
self._start -= self._start
self._next_line_search = 0
self._multiple_lines_search = 0

def __iadd__(self, byteslike):
self._data += byteslike
return self
return out

def maybe_extract_at_most(self, count):
out = self._data[self._start : self._start + count]
"""
Extract a fixed number of bytes from the buffer.
"""
out = self._data[:count]
if not out:
return None
self._start += len(out)
return out

def maybe_extract_until_next(self, needle):
# Returns extracted bytes on success (advancing offset), or None on
# failure
if self._looked_for == needle:
search_start = max(self._start, self._looked_at - len(needle) + 1)
else:
search_start = self._start
offset = self._data.find(needle, search_start)
if offset == -1:
self._looked_at = len(self._data)
self._looked_for = needle
return self._extract(count)

def maybe_extract_next_line(self):
"""
Extract the first line, if it is completed in the buffer.
"""
# Only search in buffer space that we've not already looked at.
search_start_index = max(0, self._next_line_search - 1)
partial_idx = self._data.find(b"\r\n", search_start_index)

if partial_idx == -1:
self._next_line_search = len(self._data)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In maybe_extract_next_line, we store the raw buffer length in self._next_line_search and then do the subtraction when we use it. In maybe_extract_lines, we store the "pre-subtracted" value, so we can use it directly. This inconsistency is kind of confusing :-). We should switch one of them so they match.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As soon as I rewrote both methods using _extract(...) method, this issue resolved (both methods works with offsets)

return None
new_start = offset + len(needle)
out = self._data[self._start : new_start]
self._start = new_start
return out

# HTTP/1.1 has a number of constructs where you keep reading lines until
# you see a blank one. This does that, and then returns the lines.
# + 2 to account for len(b"\r\n")
idx = partial_idx + 2

return self._extract(idx)

def maybe_extract_lines(self):
if self._data[self._start : self._start + 2] == b"\r\n":
self._start += 2
"""
Extract everything up to the first blank line, and return a list of lines.
"""
# Handle the case where we have an immediate empty line.
if self._data[:1] == b"\n":
self._extract(1)
return []

if self._data[:2] == b"\r\n":
self._extract(2)
return []
else:
data = self.maybe_extract_until_next(b"\r\n\r\n")
if data is None:
return None
lines = data.split(b"\r\n")
assert lines[-2] == lines[-1] == b""
del lines[-2:]
return lines

# Only search in buffer space that we've not already looked at.
match = blank_line_regex.search(self._data, self._multiple_lines_search)
if match is None:
self._multiple_lines_search = max(0, len(self._data) - 2)
return None

# Truncate the buffer and return it.
idx = match.span(0)[-1]
out = self._extract(idx)
lines = out.split(b"\n")

for line in lines:
if line.endswith(b"\r"):
del line[-1]

assert lines[-2] == lines[-1] == b""

del lines[-2:]

return lines
37 changes: 37 additions & 0 deletions h11/tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,43 @@ def test_readers_unusual():
),
)

# Tolerate mixed header line endings (\r\n and \n);
# \n\r\n between the headers and the body
tr(
READERS[SERVER, SEND_RESPONSE],
b"HTTP/1.1 200 OK\r\nSomeHeader: val\n\r\n",
Response(
status_code=200,
headers=[("SomeHeader", "val")],
http_version="1.1",
reason="OK",
),
)

# delimited only with \n
tr(
READERS[SERVER, SEND_RESPONSE],
b"HTTP/1.1 200 OK\nSomeHeader1: val1\nSomeHeader2: val2\n\n",
Response(
status_code=200,
headers=[("SomeHeader1", "val1"), ("SomeHeader2", "val2")],
http_version="1.1",
reason="OK",
),
)

# mixed \r\n and \n
tr(
READERS[SERVER, SEND_RESPONSE],
b"HTTP/1.1 200 OK\r\nSomeHeader1: val1\nSomeHeader2: val2\n\r\n",
Response(
status_code=200,
headers=[("SomeHeader1", "val1"), ("SomeHeader2", "val2")],
http_version="1.1",
reason="OK",
),
)

# obsolete line folding
tr(
READERS[CLIENT, IDLE],
Expand Down
92 changes: 74 additions & 18 deletions h11/tests/test_receivebuffer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
import re

import pytest

from .._receivebuffer import ReceiveBuffer


Expand All @@ -12,15 +16,13 @@ def test_receivebuffer():
assert len(b) == 3
assert bytes(b) == b"123"

b.compress()
assert bytes(b) == b"123"

assert b.maybe_extract_at_most(2) == b"12"
assert b
assert len(b) == 1
assert bytes(b) == b"3"

b.compress()
assert bytes(b) == b"3"

assert b.maybe_extract_at_most(10) == b"3"
Expand All @@ -33,32 +35,35 @@ def test_receivebuffer():
# maybe_extract_until_next
################################################################

b += b"12345a6789aa"
b += b"123\n456\r\n789\r\n"

assert b.maybe_extract_until_next(b"a") == b"12345a"
assert bytes(b) == b"6789aa"
assert b.maybe_extract_next_line() == b"123\n456\r\n"
assert bytes(b) == b"789\r\n"

assert b.maybe_extract_until_next(b"aaa") is None
assert bytes(b) == b"6789aa"
assert b.maybe_extract_next_line() == b"789\r\n"
assert bytes(b) == b""

b += b"a12"
assert b.maybe_extract_until_next(b"aaa") == b"6789aaa"
assert bytes(b) == b"12"
b += b"12\r"
assert b.maybe_extract_next_line() is None
assert bytes(b) == b"12\r"

# check repeated searches for the same needle, triggering the
# pickup-where-we-left-off logic
b += b"345"
assert b.maybe_extract_until_next(b"aaa") is None
b += b"345\n\r"
assert b.maybe_extract_next_line() is None
assert bytes(b) == b"12\r345\n\r"

b += b"6789aaa123"
assert b.maybe_extract_until_next(b"aaa") == b"123456789aaa"
assert bytes(b) == b"123"
# here we stopped at the middle of b"\r\n" delimiter

b += b"\n6789aaa123\r\n"
assert b.maybe_extract_next_line() == b"12\r345\n\r\n"
assert b.maybe_extract_next_line() == b"6789aaa123\r\n"
assert b.maybe_extract_next_line() is None
assert bytes(b) == b""

################################################################
# maybe_extract_lines
################################################################

b += b"\r\na: b\r\nfoo:bar\r\n\r\ntrailing"
b += b"123\r\na: b\r\nfoo:bar\r\n\r\ntrailing"
lines = b.maybe_extract_lines()
assert lines == [b"123", b"a: b", b"foo:bar"]
assert bytes(b) == b"trailing"
Expand All @@ -76,3 +81,54 @@ def test_receivebuffer():
b += b"\r\ntrailing"
assert b.maybe_extract_lines() == []
assert bytes(b) == b"trailing"


@pytest.mark.parametrize(
    "data",
    [
        pytest.param(
            (
                b"HTTP/1.1 200 OK\r\n",
                b"Content-type: text/plain\r\n",
                b"Connection: close\r\n",
                b"\r\n",
                b"Some body",
            ),
            id="with_crlf_delimiter",
        ),
        pytest.param(
            (
                b"HTTP/1.1 200 OK\n",
                b"Content-type: text/plain\n",
                b"Connection: close\n",
                b"\n",
                b"Some body",
            ),
            id="with_lf_only_delimiter",
        ),
        pytest.param(
            (
                b"HTTP/1.1 200 OK\n",
                b"Content-type: text/plain\r\n",
                b"Connection: close\n",
                b"\n",
                b"Some body",
            ),
            id="with_mixed_crlf_and_lf",
        ),
    ],
)
def test_receivebuffer_for_invalid_delimiter(data):
    """Header extraction succeeds whether lines end in CRLF, bare LF, or a mix."""
    buf = ReceiveBuffer()
    for chunk in data:
        buf += chunk

    extracted = buf.maybe_extract_lines()

    expected_headers = [
        b"HTTP/1.1 200 OK",
        b"Content-type: text/plain",
        b"Connection: close",
    ]
    assert extracted == expected_headers
    # Bytes after the blank line (the body) must remain in the buffer.
    assert bytes(buf) == b"Some body"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you also add a few similar tests to test_readers_unusual in test_io.py? These tests are good for the basic ReceiveBuffer functionality, but the test harness there sets up a more "end-to-end" setup that runs our full http parsing pipeline, so it would give us more confidence that everything is wired up correctly.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added similar tests to test_readers_unusual

3 changes: 3 additions & 0 deletions newsfragments/7.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Added support for servers with broken line endings.

After this change h11 accepts both ``\r\n`` and ``\n`` as header line delimiters.