diff --git a/bson/binary.py b/bson/binary.py index 6698e55ccc..48f1f58512 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -462,6 +462,10 @@ def from_vector( raise ValueError(f"{padding=}. It must be in [0,1, ..7].") if padding and not vector: raise ValueError("Empty vector with non-zero padding.") + if padding and not (vector[-1] & ((1 << padding) - 1)) == 0: # type: ignore + raise ValueError( + "If padding p is provided, all bits in the final byte lower than p must be 0." + ) elif dtype == BinaryVectorDtype.FLOAT32: # pack floats as float32 format_str = "f" if padding: @@ -490,6 +494,11 @@ def as_vector(self) -> BinaryVector: dtype = BinaryVectorDtype(dtype) n_values = len(self) - position + if padding and dtype != BinaryVectorDtype.PACKED_BIT: + raise ValueError( + f"Corrupt data. Padding ({padding}) must be 0 for all but PACKED_BIT dtypes. ({dtype=})" + ) + if dtype == BinaryVectorDtype.INT8: dtype_format = "b" format_string = f"<{n_values}{dtype_format}" @@ -513,6 +522,12 @@ def as_vector(self) -> BinaryVector: dtype_format = "B" format_string = f"<{n_values}{dtype_format}" unpacked_uint8s = list(struct.unpack_from(format_string, self, position)) + if padding and not n_values: + raise ValueError("Corrupt data. Vector has a padding P, but no data.") + if padding and n_values and not (unpacked_uint8s[-1] & ((1 << padding) - 1)) == 0: + raise ValueError( + "Corrupt data. Vector has a padding P, but bits in the final byte lower than P are non-zero." + ) return BinaryVector(unpacked_uint8s, dtype, padding) else: diff --git a/test/bson_binary_vector/packed_bit.json b/test/bson_binary_vector/packed_bit.json index a220e7e318..3015acba66 100644 --- a/test/bson_binary_vector/packed_bit.json +++ b/test/bson_binary_vector/packed_bit.json @@ -21,23 +21,32 @@ "canonical_bson": "1600000005766563746F7200040000000910007F0700" }, { - "description": "Empty Vector PACKED_BIT", + "description": "PACKED_BIT with padding", "valid": true, - "vector": [], + "vector": [127, 8], "dtype_hex": "0x10", "dtype_alias": "PACKED_BIT", - "padding": 0, - "canonical_bson": "1400000005766563746F72000200000009100000" + "padding": 3, + "canonical_bson": "1600000005766563746F7200040000000910037F0800" }, { - "description": "PACKED_BIT with padding", - "valid": true, + "description": "PACKED_BIT with inconsistent padding", + "valid": false, "vector": [127, 7], "dtype_hex": "0x10", "dtype_alias": "PACKED_BIT", "padding": 3, "canonical_bson": "1600000005766563746F7200040000000910037F0700" }, + { + "description": "Empty Vector PACKED_BIT", + "valid": true, + "vector": [], + "dtype_hex": "0x10", + "dtype_alias": "PACKED_BIT", + "padding": 0, + "canonical_bson": "1400000005766563746F72000200000009100000" + }, { "description": "Overflow Vector PACKED_BIT", "valid": false, diff --git a/test/test_bson.py b/test/test_bson.py index 1616c513c2..522945d5f4 100644 --- a/test/test_bson.py +++ b/test/test_bson.py @@ -739,7 +739,7 @@ def test_vector(self): """Tests of subtype 9""" # We start with valid cases, across the 3 dtypes implemented. # Work with a simple vector that can be interpreted as int8, float32, or ubyte - list_vector = [127, 7] + list_vector = [127, 8] # As INT8, vector has length 2 binary_vector = Binary.from_vector(list_vector, BinaryVectorDtype.INT8) vector = binary_vector.as_vector() @@ -764,18 +764,18 @@ def test_vector(self): uncompressed = "" for val in list_vector: uncompressed += format(val, "08b") - assert uncompressed[:-padding] == "0111111100000" + assert uncompressed[:-padding] == "0111111100001" # It is worthwhile explicitly showing the values encoded to BSON padded_doc = {"padded_vec": padded_vec} assert ( encode(padded_doc) - == b"\x1a\x00\x00\x00\x05padded_vec\x00\x04\x00\x00\x00\t\x10\x03\x7f\x07\x00" + == b"\x1a\x00\x00\x00\x05padded_vec\x00\x04\x00\x00\x00\t\x10\x03\x7f\x08\x00" ) # and dumped to json assert ( json_util.dumps(padded_doc) - == '{"padded_vec": {"$binary": {"base64": "EAN/Bw==", "subType": "09"}}}' + == '{"padded_vec": {"$binary": {"base64": "EAN/CA==", "subType": "09"}}}' ) # FLOAT32 is also implemented diff --git a/test/test_bson_binary_vector.py b/test/test_bson_binary_vector.py index 9bfdcbfb9a..afe01f42bf 100644 --- a/test/test_bson_binary_vector.py +++ b/test/test_bson_binary_vector.py @@ -48,11 +48,11 @@ def create_test(case_spec): def run_test(self): for test_case in case_spec.get("tests", []): description = test_case["description"] - vector_exp = test_case.get("vector", []) + vector_exp = test_case.get("vector", None) dtype_hex_exp = test_case["dtype_hex"] dtype_alias_exp = test_case.get("dtype_alias") padding_exp = test_case.get("padding", 0) - canonical_bson_exp = test_case.get("canonical_bson") + canonical_bson_exp = test_case.get("canonical_bson", None) # Convert dtype hex string into bytes dtype_exp = BinaryVectorDtype(int(dtype_hex_exp, 16).to_bytes(1, byteorder="little")) @@ -85,14 +85,25 @@ def run_test(self): self.assertEqual(cB_obs, canonical_bson_exp, description) else: - with self.assertRaises((struct.error, ValueError), msg=description): - # Tests Binary.from_vector - Binary.from_vector(vector_exp, dtype_exp, padding_exp) - # Tests Binary.as_vector - cB_exp = binascii.unhexlify(canonical_bson_exp.encode("utf8")) - decoded_doc = decode(cB_exp) - binary_obs = decoded_doc[test_key] - binary_obs.as_vector() + """ + #### To prove correct in an invalid case (`valid:false`), one MUST + - if the vector field is present, raise an exception when attempting to encode a document from the numeric values, + dtype, and padding. + - if the canonical_bson field is present, raise an exception when attempting to deserialize it into the corresponding + numeric values, as the field contains corrupted data. + """ + # Tests Binary.from_vector() + if vector_exp is not None: + with self.assertRaises((struct.error, ValueError), msg=description): + Binary.from_vector(vector_exp, dtype_exp, padding_exp) + + # Tests Binary.as_vector() + if canonical_bson_exp is not None: + with self.assertRaises((struct.error, ValueError), msg=description): + cB_exp = binascii.unhexlify(canonical_bson_exp.encode("utf8")) + decoded_doc = decode(cB_exp) + binary_obs = decoded_doc[test_key] + binary_obs.as_vector() return run_test