-
Notifications
You must be signed in to change notification settings - Fork 77
Added ability to use LF, not only CRLF delimiter for response Headers and Body #115
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 12 commits
fda65a2
d3b2ef2
b2f70e9
ce26dc7
af7b481
341f632
74693fc
489de4d
ef27d89
a94d728
8aa9e7f
1aefea0
f688615
ca534f7
a1197d6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,12 @@ | ||
import re | ||
import sys | ||
|
||
__all__ = ["ReceiveBuffer"] | ||
|
||
|
||
# Operations we want to support: | ||
# - find next \r\n or \r\n\r\n, or wait until there is one | ||
# - find next \r\n or \r\n\r\n (\n or \n\n are also acceptable), | ||
# or wait until there is one | ||
# - read at-most-N bytes | ||
# Goals: | ||
# - on average, do this fast | ||
|
@@ -38,75 +40,101 @@ | |
# slightly clever thing where we delay calling compress() until we've | ||
# processed a whole event, which could in theory be slightly more efficient | ||
# than the internal bytearray support.) | ||
|
||
blank_line_regex = re.compile(b"\n\r?\n", re.MULTILINE) | ||
|
||
|
||
class ReceiveBuffer(object): | ||
def __init__(self): | ||
self._data = bytearray() | ||
# These are both absolute offsets into self._data: | ||
self._start = 0 | ||
self._looked_at = 0 | ||
self._looked_for = b"" | ||
self._next_line_search = 0 | ||
self._multiple_lines_search = 0 | ||
|
||
def __iadd__(self, byteslike): | ||
self._data += byteslike | ||
return self | ||
|
||
def __bool__(self): | ||
return bool(len(self)) | ||
|
||
def __len__(self): | ||
return len(self._data) | ||
|
||
# for @property unprocessed_data | ||
def __bytes__(self): | ||
return bytes(self._data[self._start :]) | ||
return bytes(self._data) | ||
|
||
if sys.version_info[0] < 3: # version specific: Python 2 | ||
__str__ = __bytes__ | ||
__nonzero__ = __bool__ | ||
|
||
def __len__(self): | ||
return len(self._data) - self._start | ||
|
||
def compress(self): | ||
# Heuristic: only compress if it lets us reduce size by a factor | ||
# of 2 | ||
if self._start > len(self._data) // 2: | ||
del self._data[: self._start] | ||
self._looked_at -= self._start | ||
self._start -= self._start | ||
|
||
def __iadd__(self, byteslike): | ||
self._data += byteslike | ||
return self | ||
|
||
def maybe_extract_at_most(self, count): | ||
out = self._data[self._start : self._start + count] | ||
""" | ||
Extract a fixed number of bytes from the buffer. | ||
""" | ||
out = self._data[:count] | ||
if not out: | ||
return None | ||
self._start += len(out) | ||
|
||
self._data[:count] = b"" | ||
self._next_line_search = 0 | ||
self._multiple_lines_search = 0 | ||
return out | ||
|
||
def maybe_extract_until_next(self, needle): | ||
# Returns extracted bytes on success (advancing offset), or None on | ||
# failure | ||
if self._looked_for == needle: | ||
search_start = max(self._start, self._looked_at - len(needle) + 1) | ||
else: | ||
search_start = self._start | ||
offset = self._data.find(needle, search_start) | ||
if offset == -1: | ||
self._looked_at = len(self._data) | ||
self._looked_for = needle | ||
def maybe_extract_next_line(self): | ||
""" | ||
Extract the first line, if it is completed in the buffer. | ||
""" | ||
# Only search in buffer space that we've not already looked at. | ||
search_start_index = max(0, self._next_line_search - 1) | ||
partial_buffer = self._data[search_start_index:] | ||
partial_idx = partial_buffer.find(b"\r\n") | ||
if partial_idx == -1: | ||
self._next_line_search = len(self._data) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As soon as I rewrote both methods using |
||
return None | ||
new_start = offset + len(needle) | ||
out = self._data[self._start : new_start] | ||
self._start = new_start | ||
|
||
# Truncate the buffer and return it. | ||
# + 2 is to compensate len(b"\r\n") | ||
idx = search_start_index + partial_idx + 2 | ||
out = self._data[:idx] | ||
self._data[:idx] = b"" | ||
self._next_line_search = 0 | ||
self._multiple_lines_search = 0 | ||
return out | ||
|
||
# HTTP/1.1 has a number of constructs where you keep reading lines until | ||
# you see a blank one. This does that, and then returns the lines. | ||
def maybe_extract_lines(self): | ||
if self._data[self._start : self._start + 2] == b"\r\n": | ||
self._start += 2 | ||
""" | ||
Extract everything up to the first blank line, and return a list of lines. | ||
""" | ||
# Handle the case where we have an immediate empty line. | ||
if self._data[:1] == b"\n": | ||
self._data[:1] = b"" | ||
self._next_line_search = 0 | ||
self._multiple_lines_search = 0 | ||
return [] | ||
|
||
if self._data[:2] == b"\r\n": | ||
self._data[:2] = b"" | ||
self._next_line_search = 0 | ||
self._multiple_lines_search = 0 | ||
return [] | ||
else: | ||
data = self.maybe_extract_until_next(b"\r\n\r\n") | ||
if data is None: | ||
return None | ||
lines = data.split(b"\r\n") | ||
assert lines[-2] == lines[-1] == b"" | ||
del lines[-2:] | ||
return lines | ||
|
||
# Only search in buffer space that we've not already looked at. | ||
partial_buffer = self._data[self._multiple_lines_search :] | ||
match = blank_line_regex.search(partial_buffer) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Instead of copying the buffer, use the match = blank_line_regex.search(self._data, self._multiple_lines_search) |
||
if match is None: | ||
self._multiple_lines_search = max(0, len(self._data) - 2) | ||
return None | ||
|
||
# Truncate the buffer and return it. | ||
idx = self._multiple_lines_search + match.span(0)[-1] | ||
out = self._data[:idx] | ||
lines = [line.rstrip(b"\r") for line in out.split(b"\n")] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Instead of calling header_field = (
r"(?P<field_name>{field_name})"
r":"
r"{OWS}"
r"(?P<field_value>{field_value})"
r"{OWS}\r?".format(**globals()) # <-- notice added \r? here
) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tried to do that. I added Then I set lines = out.split(b"\n") but it broke one of the test sourcedef test_readers_unusual():
...
# obsolete line folding
tr(
READERS[CLIENT, IDLE],
b"HEAD /foo HTTP/1.1\r\n"
b"Host: example.com\r\n"
b"Some: multi-line\r\n"
b" header\r\n"
b"\tnonsense\r\n"
b" \t \t\tI guess\r\n"
b"Connection: close\r\n"
b"More-nonsense: in the\r\n"
b" last header \r\n\r\n",
Request(
method="HEAD",
target="/foo",
headers=[
("Host", "example.com"),
("Some", "multi-line header nonsense I guess"),
("Connection", "close"),
("More-nonsense", "in the last header"),
],
),
) The header b"Some: multi-line\r\n"
b" header\r\n"
b"\tnonsense\r\n"
b" \t \t\tI guess\r\n" turns to b"Some: multi-line\r header\r\tnonsense\r \t \t\tI guess\r\n" I cannot figure out how to carefully cut out the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. After some discussion in gitter, it has been decided not to change regexes, but to rewrite for line in lines:
if line.endswith(b"\r"):
del line[-1] to avoid extra memory allocation |
||
|
||
self._data[:idx] = b"" | ||
self._next_line_search = 0 | ||
self._multiple_lines_search = 0 | ||
|
||
assert lines[-2] == lines[-1] == b"" | ||
|
||
return lines[:-2] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,7 @@ | ||
import re | ||
|
||
import pytest | ||
|
||
from .._receivebuffer import ReceiveBuffer | ||
|
||
|
||
|
@@ -12,15 +16,13 @@ def test_receivebuffer(): | |
assert len(b) == 3 | ||
assert bytes(b) == b"123" | ||
|
||
b.compress() | ||
assert bytes(b) == b"123" | ||
|
||
assert b.maybe_extract_at_most(2) == b"12" | ||
assert b | ||
assert len(b) == 1 | ||
assert bytes(b) == b"3" | ||
|
||
b.compress() | ||
assert bytes(b) == b"3" | ||
|
||
assert b.maybe_extract_at_most(10) == b"3" | ||
|
@@ -33,32 +35,35 @@ def test_receivebuffer(): | |
# maybe_extract_until_next | ||
################################################################ | ||
|
||
b += b"12345a6789aa" | ||
b += b"123\n456\r\n789\r\n" | ||
|
||
assert b.maybe_extract_until_next(b"a") == b"12345a" | ||
assert bytes(b) == b"6789aa" | ||
assert b.maybe_extract_next_line() == b"123\n456\r\n" | ||
assert bytes(b) == b"789\r\n" | ||
|
||
assert b.maybe_extract_until_next(b"aaa") is None | ||
assert bytes(b) == b"6789aa" | ||
assert b.maybe_extract_next_line() == b"789\r\n" | ||
assert bytes(b) == b"" | ||
|
||
b += b"a12" | ||
assert b.maybe_extract_until_next(b"aaa") == b"6789aaa" | ||
assert bytes(b) == b"12" | ||
b += b"12\r" | ||
assert b.maybe_extract_next_line() is None | ||
assert bytes(b) == b"12\r" | ||
|
||
# check repeated searches for the same needle, triggering the | ||
# pickup-where-we-left-off logic | ||
b += b"345" | ||
assert b.maybe_extract_until_next(b"aaa") is None | ||
b += b"345\n\r" | ||
assert b.maybe_extract_next_line() is None | ||
assert bytes(b) == b"12\r345\n\r" | ||
|
||
b += b"6789aaa123" | ||
assert b.maybe_extract_until_next(b"aaa") == b"123456789aaa" | ||
assert bytes(b) == b"123" | ||
# here we stopped at the middle of b"\r\n" delimiter | ||
|
||
b += b"\n6789aaa123\r\n" | ||
assert b.maybe_extract_next_line() == b"12\r345\n\r\n" | ||
assert b.maybe_extract_next_line() == b"6789aaa123\r\n" | ||
assert b.maybe_extract_next_line() is None | ||
assert bytes(b) == b"" | ||
|
||
################################################################ | ||
# maybe_extract_lines | ||
################################################################ | ||
|
||
b += b"\r\na: b\r\nfoo:bar\r\n\r\ntrailing" | ||
b += b"123\r\na: b\r\nfoo:bar\r\n\r\ntrailing" | ||
lines = b.maybe_extract_lines() | ||
assert lines == [b"123", b"a: b", b"foo:bar"] | ||
assert bytes(b) == b"trailing" | ||
|
@@ -76,3 +81,54 @@ def test_receivebuffer(): | |
b += b"\r\ntrailing" | ||
assert b.maybe_extract_lines() == [] | ||
assert bytes(b) == b"trailing" | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"data", | ||
[ | ||
pytest.param( | ||
( | ||
b"HTTP/1.1 200 OK\r\n", | ||
b"Content-type: text/plain\r\n", | ||
b"Connection: close\r\n", | ||
b"\r\n", | ||
b"Some body", | ||
), | ||
id="with_crlf_delimiter", | ||
), | ||
pytest.param( | ||
( | ||
b"HTTP/1.1 200 OK\n", | ||
b"Content-type: text/plain\n", | ||
b"Connection: close\n", | ||
b"\n", | ||
b"Some body", | ||
), | ||
id="with_lf_only_delimiter", | ||
), | ||
pytest.param( | ||
( | ||
b"HTTP/1.1 200 OK\n", | ||
b"Content-type: text/plain\r\n", | ||
b"Connection: close\n", | ||
b"\n", | ||
b"Some body", | ||
), | ||
id="with_mixed_crlf_and_lf", | ||
), | ||
], | ||
) | ||
def test_receivebuffer_for_invalid_delimiter(data): | ||
b = ReceiveBuffer() | ||
|
||
for line in data: | ||
b += line | ||
|
||
lines = b.maybe_extract_lines() | ||
|
||
assert lines == [ | ||
b"HTTP/1.1 200 OK", | ||
b"Content-type: text/plain", | ||
b"Connection: close", | ||
] | ||
assert bytes(b) == b"Some body" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you also add a few similar tests to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I added similar tests to |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There's a lot of copy/pastes of this code for extracting an initial slice and then doing internal bookkeeping, which is error prone. Let's factor it out into a method, like:
And then in all the other methods, just do:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I introduced the
_extract
, and replaced all copy/pastes using this function.It also helped to resolve the above comment (#115 (comment))