Skip to content

Commit a790e40

Browse files
jhnstrk and Kludex authored
Improve performance, especially in data with many CR-LF (#137)
* Improve parsing content with many cr-lf

  Drops the look-behind buffer since the content is always the boundary.

* Improve performance by using built-in bytes.find.

  The Boyer-Moore-Horspool algorithm was removed and replaced with Python's built-in `find` method. This appears to be faster, sometimes by an order of magnitude.

* Delete unused join_bytes

---------

Co-authored-by: Marcelo Trylesinski <[email protected]>
1 parent dcf0ba1 commit a790e40

File tree

2 files changed

+89
-76
lines changed

2 files changed

+89
-76
lines changed

multipart/multipart.py

Lines changed: 61 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -146,10 +146,6 @@ def ord_char(c: int) -> int:
146146
return c
147147

148148

149-
def join_bytes(b: bytes) -> bytes:
150-
return bytes(list(b))
151-
152-
153149
def parse_options_header(value: str | bytes) -> tuple[bytes, dict[bytes, bytes]]:
154150
"""Parses a Content-Type header into a value in the following format: (content_type, {parameters})."""
155151
# Uses email.message.Message to parse the header as described in PEP 594.
@@ -976,29 +972,11 @@ def __init__(
976972
# Setup marks. These are used to track the state of data received.
977973
self.marks: dict[str, int] = {}
978974

979-
# TODO: Actually use this rather than the dumb version we currently use
980-
# # Precompute the skip table for the Boyer-Moore-Horspool algorithm.
981-
# skip = [len(boundary) for x in range(256)]
982-
# for i in range(len(boundary) - 1):
983-
# skip[ord_char(boundary[i])] = len(boundary) - i - 1
984-
#
985-
# # We use a tuple since it's a constant, and marginally faster.
986-
# self.skip = tuple(skip)
987-
988975
# Save our boundary.
989976
if isinstance(boundary, str): # pragma: no cover
990977
boundary = boundary.encode("latin-1")
991978
self.boundary = b"\r\n--" + boundary
992979

993-
# Get a set of characters that belong to our boundary.
994-
self.boundary_chars = frozenset(self.boundary)
995-
996-
# We also create a lookbehind list.
997-
# Note: the +8 is since we can have, at maximum, "\r\n--" + boundary +
998-
# "--\r\n" at the final boundary, and the length of '\r\n--' and
999-
# '--\r\n' is 8 bytes.
1000-
self.lookbehind = [NULL for _ in range(len(boundary) + 8)]
1001-
1002980
def write(self, data: bytes) -> int:
1003981
"""Write some data to the parser, which will perform size verification,
1004982
and then parse the data into the appropriate location (e.g. header,
@@ -1061,21 +1039,43 @@ def delete_mark(name: str, reset: bool = False) -> None:
10611039
# end of the buffer, and reset the mark, instead of deleting it. This
10621040
# is used at the end of the function to call our callbacks with any
10631041
# remaining data in this chunk.
1064-
def data_callback(name: str, remaining: bool = False) -> None:
1042+
def data_callback(name: str, end_i: int, remaining: bool = False) -> None:
10651043
marked_index = self.marks.get(name)
10661044
if marked_index is None:
10671045
return
10681046

1069-
# If we're getting remaining data, we ignore the current i value
1070-
# and just call with the remaining data.
1071-
if remaining:
1072-
self.callback(name, data, marked_index, length)
1073-
self.marks[name] = 0
1074-
10751047
# Otherwise, we call it from the mark to the current byte we're
10761048
# processing.
1049+
if end_i <= marked_index:
1050+
# There is no additional data to send.
1051+
pass
1052+
elif marked_index >= 0:
1053+
# We are emitting data from the local buffer.
1054+
self.callback(name, data, marked_index, end_i)
1055+
else:
1056+
# Some of the data comes from a partial boundary match
1057+
# and requires look-behind.
1058+
# We need to use self.flags (and not flags) because we care about
1059+
# the state when we entered the loop.
1060+
lookbehind_len = -marked_index
1061+
if lookbehind_len <= len(boundary):
1062+
self.callback(name, boundary, 0, lookbehind_len)
1063+
elif self.flags & FLAG_PART_BOUNDARY:
1064+
lookback = boundary + b"\r\n"
1065+
self.callback(name, lookback, 0, lookbehind_len)
1066+
elif self.flags & FLAG_LAST_BOUNDARY:
1067+
lookback = boundary + b"--\r\n"
1068+
self.callback(name, lookback, 0, lookbehind_len)
1069+
else: # pragma: no cover (error case)
1070+
self.logger.warning("Look-back buffer error")
1071+
1072+
if end_i > 0:
1073+
self.callback(name, data, 0, end_i)
1074+
# If we're getting remaining data, we have got all the data we
1075+
# can be certain is not a boundary, leaving only a partial boundary match.
1076+
if remaining:
1077+
self.marks[name] = end_i - length
10771078
else:
1078-
self.callback(name, data, marked_index, i)
10791079
self.marks.pop(name, None)
10801080

10811081
# For each byte...
@@ -1183,7 +1183,7 @@ def data_callback(name: str, remaining: bool = False) -> None:
11831183
raise e
11841184

11851185
# Call our callback with the header field.
1186-
data_callback("header_field")
1186+
data_callback("header_field", i)
11871187

11881188
# Move to parsing the header value.
11891189
state = MultipartState.HEADER_VALUE_START
@@ -1212,7 +1212,7 @@ def data_callback(name: str, remaining: bool = False) -> None:
12121212
# If we've got a CR, we're nearly done with our headers. Otherwise,
12131213
# we do nothing and just move past this character.
12141214
if c == CR:
1215-
data_callback("header_value")
1215+
data_callback("header_value", i)
12161216
self.callback("header_end")
12171217
state = MultipartState.HEADER_VALUE_ALMOST_DONE
12181218

@@ -1256,46 +1256,46 @@ def data_callback(name: str, remaining: bool = False) -> None:
12561256
# We're processing our part data right now. During this, we
12571257
# need to efficiently search for our boundary, since any data
12581258
# on any number of lines can be a part of the current data.
1259-
# We use the Boyer-Moore-Horspool algorithm to efficiently
1260-
# search through the remainder of the buffer looking for our
1261-
# boundary.
12621259

12631260
# Save the current value of our index. We use this in case we
12641261
# find part of a boundary, but it doesn't match fully.
12651262
prev_index = index
12661263

12671264
# Set up variables.
12681265
boundary_length = len(boundary)
1269-
boundary_end = boundary_length - 1
12701266
data_length = length
1271-
boundary_chars = self.boundary_chars
12721267

12731268
# If our index is 0, we're starting a new part, so start our
12741269
# search.
12751270
if index == 0:
1276-
# Search forward until we either hit the end of our buffer,
1277-
# or reach a character that's in our boundary.
1278-
i += boundary_end
1279-
while i < data_length - 1 and data[i] not in boundary_chars:
1280-
i += boundary_length
1281-
1282-
# Reset i back the length of our boundary, which is the
1283-
# earliest possible location that could be our match (i.e.
1284-
# if we've just broken out of our loop since we saw the
1285-
# last character in our boundary)
1286-
i -= boundary_end
1271+
# The most common case is likely to be that the whole
1272+
# boundary is present in the buffer.
1273+
# Calling `find` is much faster than iterating here.
1274+
i0 = data.find(boundary, i, data_length)
1275+
if i0 >= 0:
1276+
# We matched the whole boundary string.
1277+
index = boundary_length - 1
1278+
i = i0 + boundary_length - 1
1279+
else:
1280+
# No match found for whole string.
1281+
# There may be a partial boundary at the end of the
1282+
# data, which the find will not match.
1283+
# Since the length that needs to be searched is limited to
1284+
# the boundary length, just perform a naive search.
1285+
i = max(i, data_length - boundary_length)
1286+
1287+
# Search forward until we either hit the end of our buffer,
1288+
# or reach a potential start of the boundary.
1289+
while i < data_length - 1 and data[i] != boundary[0]:
1290+
i += 1
1291+
12871292
c = data[i]
12881293

12891294
# Now, we have a couple of cases here. If our index is before
12901295
# the end of the boundary...
12911296
if index < boundary_length:
12921297
# If the character matches...
12931298
if boundary[index] == c:
1294-
# If we found a match for our boundary, we send the
1295-
# existing data.
1296-
if index == 0:
1297-
data_callback("part_data")
1298-
12991299
# The current character matches, so continue!
13001300
index += 1
13011301
else:
@@ -1332,6 +1332,8 @@ def data_callback(name: str, remaining: bool = False) -> None:
13321332
# Unset the part boundary flag.
13331333
flags &= ~FLAG_PART_BOUNDARY
13341334

1335+
# We have identified a boundary, callback for any data before it.
1336+
data_callback("part_data", i - index)
13351337
# Callback indicating that we've reached the end of
13361338
# a part, and are starting a new one.
13371339
self.callback("part_end")
@@ -1353,6 +1355,8 @@ def data_callback(name: str, remaining: bool = False) -> None:
13531355
elif flags & FLAG_LAST_BOUNDARY:
13541356
# We need a second hyphen here.
13551357
if c == HYPHEN:
1358+
# We have identified a boundary, callback for any data before it.
1359+
data_callback("part_data", i - index)
13561360
# Callback to end the current part, and then the
13571361
# message.
13581362
self.callback("part_end")
@@ -1362,26 +1366,14 @@ def data_callback(name: str, remaining: bool = False) -> None:
13621366
# No match, so reset index.
13631367
index = 0
13641368

1365-
# If we have an index, we need to keep this byte for later, in
1366-
# case we can't match the full boundary.
1367-
if index > 0:
1368-
self.lookbehind[index - 1] = c
1369-
13701369
# Otherwise, our index is 0. If the previous index is not, it
13711370
# means we reset something, and we need to take the data we
13721371
# thought was part of our boundary and send it along as actual
13731372
# data.
1374-
elif prev_index > 0:
1375-
# Callback to write the saved data.
1376-
lb_data = join_bytes(self.lookbehind)
1377-
self.callback("part_data", lb_data, 0, prev_index)
1378-
1373+
if index == 0 and prev_index > 0:
13791374
# Overwrite our previous index.
13801375
prev_index = 0
13811376

1382-
# Re-set our mark for part data.
1383-
set_mark("part_data")
1384-
13851377
# Re-consider the current character, since this could be
13861378
# the start of the boundary itself.
13871379
i -= 1
@@ -1410,9 +1402,9 @@ def data_callback(name: str, remaining: bool = False) -> None:
14101402
# that we haven't yet reached the end of this 'thing'. So, by setting
14111403
# the mark to 0, we cause any data callbacks that take place in future
14121404
# calls to this function to start from the beginning of that buffer.
1413-
data_callback("header_field", True)
1414-
data_callback("header_value", True)
1415-
data_callback("part_data", True)
1405+
data_callback("header_field", length, True)
1406+
data_callback("header_value", length, True)
1407+
data_callback("part_data", length - index, True)
14161408

14171409
# Save values to locals.
14181410
self.state = state

tests/test_multipart.py

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -695,6 +695,14 @@ def test_not_aligned(self):
695695

696696
http_tests.append({"name": fname, "test": test_data, "result": yaml_data})
697697

698+
# Datasets used for single-byte writing test.
699+
single_byte_tests = [
700+
"almost_match_boundary",
701+
"almost_match_boundary_without_CR",
702+
"almost_match_boundary_without_LF",
703+
"almost_match_boundary_without_final_hyphen",
704+
"single_field_single_file",
705+
]
698706

699707
def split_all(val):
700708
"""
@@ -843,17 +851,19 @@ def test_random_splitting(self):
843851
self.assert_field(b"field", b"test1")
844852
self.assert_file(b"file", b"file.txt", b"test2")
845853

846-
def test_feed_single_bytes(self):
854+
@parametrize("param", [ t for t in http_tests if t["name"] in single_byte_tests])
855+
def test_feed_single_bytes(self, param):
847856
"""
848-
This test parses a simple multipart body 1 byte at a time.
857+
This test parses multipart bodies 1 byte at a time.
849858
"""
850859
# Load test data.
851-
test_file = "single_field_single_file.http"
860+
test_file = param["name"] + ".http"
861+
boundary = param["result"]["boundary"]
852862
with open(os.path.join(http_tests_dir, test_file), "rb") as f:
853863
test_data = f.read()
854864

855865
# Create form parser.
856-
self.make("boundary")
866+
self.make(boundary)
857867

858868
# Write all bytes.
859869
# NOTE: Can't simply do `for b in test_data`, since that gives
@@ -868,9 +878,20 @@ def test_feed_single_bytes(self):
868878
# Assert we processed everything.
869879
self.assertEqual(i, len(test_data))
870880

871-
# Assert that our file and field are here.
872-
self.assert_field(b"field", b"test1")
873-
self.assert_file(b"file", b"file.txt", b"test2")
881+
# Assert that the parser gave us the appropriate fields/files.
882+
for e in param["result"]["expected"]:
883+
# Get our type and name.
884+
type = e["type"]
885+
name = e["name"].encode("latin-1")
886+
887+
if type == "field":
888+
self.assert_field(name, e["data"])
889+
890+
elif type == "file":
891+
self.assert_file(name, e["file_name"].encode("latin-1"), e["data"])
892+
893+
else:
894+
assert False
874895

875896
def test_feed_blocks(self):
876897
"""

0 commit comments

Comments
 (0)