@@ -146,10 +146,6 @@ def ord_char(c: int) -> int:
146
146
return c
147
147
148
148
149
- def join_bytes (b : bytes ) -> bytes :
150
- return bytes (list (b ))
151
-
152
-
153
149
def parse_options_header (value : str | bytes ) -> tuple [bytes , dict [bytes , bytes ]]:
154
150
"""Parses a Content-Type header into a value in the following format: (content_type, {parameters})."""
155
151
# Uses email.message.Message to parse the header as described in PEP 594.
@@ -976,29 +972,11 @@ def __init__(
976
972
# Setup marks. These are used to track the state of data received.
977
973
self .marks : dict [str , int ] = {}
978
974
979
- # TODO: Actually use this rather than the dumb version we currently use
980
- # # Precompute the skip table for the Boyer-Moore-Horspool algorithm.
981
- # skip = [len(boundary) for x in range(256)]
982
- # for i in range(len(boundary) - 1):
983
- # skip[ord_char(boundary[i])] = len(boundary) - i - 1
984
- #
985
- # # We use a tuple since it's a constant, and marginally faster.
986
- # self.skip = tuple(skip)
987
-
988
975
# Save our boundary.
989
976
if isinstance (boundary , str ): # pragma: no cover
990
977
boundary = boundary .encode ("latin-1" )
991
978
self .boundary = b"\r \n --" + boundary
992
979
993
- # Get a set of characters that belong to our boundary.
994
- self .boundary_chars = frozenset (self .boundary )
995
-
996
- # We also create a lookbehind list.
997
- # Note: the +8 is since we can have, at maximum, "\r\n--" + boundary +
998
- # "--\r\n" at the final boundary, and the length of '\r\n--' and
999
- # '--\r\n' is 8 bytes.
1000
- self .lookbehind = [NULL for _ in range (len (boundary ) + 8 )]
1001
-
1002
980
def write (self , data : bytes ) -> int :
1003
981
"""Write some data to the parser, which will perform size verification,
1004
982
and then parse the data into the appropriate location (e.g. header,
@@ -1061,21 +1039,43 @@ def delete_mark(name: str, reset: bool = False) -> None:
1061
1039
# end of the buffer, and reset the mark, instead of deleting it. This
1062
1040
# is used at the end of the function to call our callbacks with any
1063
1041
# remaining data in this chunk.
1064
- def data_callback (name : str , remaining : bool = False ) -> None :
1042
+ def data_callback (name : str , end_i : int , remaining : bool = False ) -> None :
1065
1043
marked_index = self .marks .get (name )
1066
1044
if marked_index is None :
1067
1045
return
1068
1046
1069
- # If we're getting remaining data, we ignore the current i value
1070
- # and just call with the remaining data.
1071
- if remaining :
1072
- self .callback (name , data , marked_index , length )
1073
- self .marks [name ] = 0
1074
-
1075
1047
# Otherwise, we call it from the mark to the current byte we're
1076
1048
# processing.
1049
+ if end_i <= marked_index :
1050
+ # There is no additional data to send.
1051
+ pass
1052
+ elif marked_index >= 0 :
1053
+ # We are emitting data from the local buffer.
1054
+ self .callback (name , data , marked_index , end_i )
1055
+ else :
1056
+ # Some of the data comes from a partial boundary match.
1057
+ # and requires look-behind.
1058
+ # We need to use self.flags (and not flags) because we care about
1059
+ # the state when we entered the loop.
1060
+ lookbehind_len = - marked_index
1061
+ if lookbehind_len <= len (boundary ):
1062
+ self .callback (name , boundary , 0 , lookbehind_len )
1063
+ elif self .flags & FLAG_PART_BOUNDARY :
1064
+ lookback = boundary + b"\r \n "
1065
+ self .callback (name , lookback , 0 , lookbehind_len )
1066
+ elif self .flags & FLAG_LAST_BOUNDARY :
1067
+ lookback = boundary + b"--\r \n "
1068
+ self .callback (name , lookback , 0 , lookbehind_len )
1069
+ else : # pragma: no cover (error case)
1070
+ self .logger .warning ("Look-back buffer error" )
1071
+
1072
+ if end_i > 0 :
1073
+ self .callback (name , data , 0 , end_i )
1074
+ # If we're getting remaining data, we have got all the data we
1075
+ # can be certain is not a boundary, leaving only a partial boundary match.
1076
+ if remaining :
1077
+ self .marks [name ] = end_i - length
1077
1078
else :
1078
- self .callback (name , data , marked_index , i )
1079
1079
self .marks .pop (name , None )
1080
1080
1081
1081
# For each byte...
@@ -1183,7 +1183,7 @@ def data_callback(name: str, remaining: bool = False) -> None:
1183
1183
raise e
1184
1184
1185
1185
# Call our callback with the header field.
1186
- data_callback ("header_field" )
1186
+ data_callback ("header_field" , i )
1187
1187
1188
1188
# Move to parsing the header value.
1189
1189
state = MultipartState .HEADER_VALUE_START
@@ -1212,7 +1212,7 @@ def data_callback(name: str, remaining: bool = False) -> None:
1212
1212
# If we've got a CR, we're nearly done our headers. Otherwise,
1213
1213
# we do nothing and just move past this character.
1214
1214
if c == CR :
1215
- data_callback ("header_value" )
1215
+ data_callback ("header_value" , i )
1216
1216
self .callback ("header_end" )
1217
1217
state = MultipartState .HEADER_VALUE_ALMOST_DONE
1218
1218
@@ -1256,46 +1256,46 @@ def data_callback(name: str, remaining: bool = False) -> None:
1256
1256
# We're processing our part data right now. During this, we
1257
1257
# need to efficiently search for our boundary, since any data
1258
1258
# on any number of lines can be a part of the current data.
1259
- # We use the Boyer-Moore-Horspool algorithm to efficiently
1260
- # search through the remainder of the buffer looking for our
1261
- # boundary.
1262
1259
1263
1260
# Save the current value of our index. We use this in case we
1264
1261
# find part of a boundary, but it doesn't match fully.
1265
1262
prev_index = index
1266
1263
1267
1264
# Set up variables.
1268
1265
boundary_length = len (boundary )
1269
- boundary_end = boundary_length - 1
1270
1266
data_length = length
1271
- boundary_chars = self .boundary_chars
1272
1267
1273
1268
# If our index is 0, we're starting a new part, so start our
1274
1269
# search.
1275
1270
if index == 0 :
1276
- # Search forward until we either hit the end of our buffer,
1277
- # or reach a character that's in our boundary.
1278
- i += boundary_end
1279
- while i < data_length - 1 and data [i ] not in boundary_chars :
1280
- i += boundary_length
1281
-
1282
- # Reset i back the length of our boundary, which is the
1283
- # earliest possible location that could be our match (i.e.
1284
- # if we've just broken out of our loop since we saw the
1285
- # last character in our boundary)
1286
- i -= boundary_end
1271
+ # The most common case is likely to be that the whole
1272
+ # boundary is present in the buffer.
1273
+ # Calling `find` is much faster than iterating here.
1274
+ i0 = data .find (boundary , i , data_length )
1275
+ if i0 >= 0 :
1276
+ # We matched the whole boundary string.
1277
+ index = boundary_length - 1
1278
+ i = i0 + boundary_length - 1
1279
+ else :
1280
+ # No match found for whole string.
1281
+ # There may be a partial boundary at the end of the
1282
+ # data, which the find will not match.
1283
+ # Since the length should to be searched is limited to
1284
+ # the boundary length, just perform a naive search.
1285
+ i = max (i , data_length - boundary_length )
1286
+
1287
+ # Search forward until we either hit the end of our buffer,
1288
+ # or reach a potential start of the boundary.
1289
+ while i < data_length - 1 and data [i ] != boundary [0 ]:
1290
+ i += 1
1291
+
1287
1292
c = data [i ]
1288
1293
1289
1294
# Now, we have a couple of cases here. If our index is before
1290
1295
# the end of the boundary...
1291
1296
if index < boundary_length :
1292
1297
# If the character matches...
1293
1298
if boundary [index ] == c :
1294
- # If we found a match for our boundary, we send the
1295
- # existing data.
1296
- if index == 0 :
1297
- data_callback ("part_data" )
1298
-
1299
1299
# The current character matches, so continue!
1300
1300
index += 1
1301
1301
else :
@@ -1332,6 +1332,8 @@ def data_callback(name: str, remaining: bool = False) -> None:
1332
1332
# Unset the part boundary flag.
1333
1333
flags &= ~ FLAG_PART_BOUNDARY
1334
1334
1335
+ # We have identified a boundary, callback for any data before it.
1336
+ data_callback ("part_data" , i - index )
1335
1337
# Callback indicating that we've reached the end of
1336
1338
# a part, and are starting a new one.
1337
1339
self .callback ("part_end" )
@@ -1353,6 +1355,8 @@ def data_callback(name: str, remaining: bool = False) -> None:
1353
1355
elif flags & FLAG_LAST_BOUNDARY :
1354
1356
# We need a second hyphen here.
1355
1357
if c == HYPHEN :
1358
+ # We have identified a boundary, callback for any data before it.
1359
+ data_callback ("part_data" , i - index )
1356
1360
# Callback to end the current part, and then the
1357
1361
# message.
1358
1362
self .callback ("part_end" )
@@ -1362,26 +1366,14 @@ def data_callback(name: str, remaining: bool = False) -> None:
1362
1366
# No match, so reset index.
1363
1367
index = 0
1364
1368
1365
- # If we have an index, we need to keep this byte for later, in
1366
- # case we can't match the full boundary.
1367
- if index > 0 :
1368
- self .lookbehind [index - 1 ] = c
1369
-
1370
1369
# Otherwise, our index is 0. If the previous index is not, it
1371
1370
# means we reset something, and we need to take the data we
1372
1371
# thought was part of our boundary and send it along as actual
1373
1372
# data.
1374
- elif prev_index > 0 :
1375
- # Callback to write the saved data.
1376
- lb_data = join_bytes (self .lookbehind )
1377
- self .callback ("part_data" , lb_data , 0 , prev_index )
1378
-
1373
+ if index == 0 and prev_index > 0 :
1379
1374
# Overwrite our previous index.
1380
1375
prev_index = 0
1381
1376
1382
- # Re-set our mark for part data.
1383
- set_mark ("part_data" )
1384
-
1385
1377
# Re-consider the current character, since this could be
1386
1378
# the start of the boundary itself.
1387
1379
i -= 1
@@ -1410,9 +1402,9 @@ def data_callback(name: str, remaining: bool = False) -> None:
1410
1402
# that we haven't yet reached the end of this 'thing'. So, by setting
1411
1403
# the mark to 0, we cause any data callbacks that take place in future
1412
1404
# calls to this function to start from the beginning of that buffer.
1413
- data_callback ("header_field" , True )
1414
- data_callback ("header_value" , True )
1415
- data_callback ("part_data" , True )
1405
+ data_callback ("header_field" , length , True )
1406
+ data_callback ("header_value" , length , True )
1407
+ data_callback ("part_data" , length - index , True )
1416
1408
1417
1409
# Save values to locals.
1418
1410
self .state = state
0 commit comments