From 70f6e43d0efa8614bef5262f189799c0e5f5fefe Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Thu, 3 Apr 2025 18:22:54 -0400 Subject: [PATCH 1/4] Fixes PYTHON-5126 and adopts agreement that ignored bits raise if non-zero --- bson/binary.py | 15 ++++++++++++ test/bson_binary_vector/packed_bit.json | 25 +++++++++++++------- test/test_bson_binary_vector.py | 31 +++++++++++++++++-------- 3 files changed, 53 insertions(+), 18 deletions(-) diff --git a/bson/binary.py b/bson/binary.py index 6698e55ccc..78d26094dc 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -462,6 +462,10 @@ def from_vector( raise ValueError(f"{padding=}. It must be in [0,1, ..7].") if padding and not vector: raise ValueError("Empty vector with non-zero padding.") + if padding and not (vector[-1] & ((1 << padding) - 1)) == 0: + raise ValueError( + "If padding p is provided, all bits in the final byte lower than p must be 0." + ) elif dtype == BinaryVectorDtype.FLOAT32: # pack floats as float32 format_str = "f" if padding: @@ -490,6 +494,11 @@ def as_vector(self) -> BinaryVector: dtype = BinaryVectorDtype(dtype) n_values = len(self) - position + if padding and dtype != BinaryVectorDtype.PACKED_BIT: + raise ValueError( + f"Corrupt data. Padding ({padding}) must be 0 for all but PACKED_BIT dtypes. ({dtype=})" + ) + if dtype == BinaryVectorDtype.INT8: dtype_format = "b" format_string = f"<{n_values}{dtype_format}" @@ -513,6 +522,12 @@ def as_vector(self) -> BinaryVector: dtype_format = "B" format_string = f"<{n_values}{dtype_format}" unpacked_uint8s = list(struct.unpack_from(format_string, self, position)) + if padding and not n_values: + raise ValueError("Corrupt data. Vector has a padding P, but no data.") + if padding and n_values and not (unpacked_uint8s[-1] & ((1 << padding) - 1)) == 0: + raise ValueError( + "Corrupt data. Vector has a padding P, but bits in the final byte lower than P are non-zero." + ) return BinaryVector(unpacked_uint8s, dtype, padding) else: diff --git a/test/bson_binary_vector/packed_bit.json b/test/bson_binary_vector/packed_bit.json index a220e7e318..8823095632 100644 --- a/test/bson_binary_vector/packed_bit.json +++ b/test/bson_binary_vector/packed_bit.json @@ -14,30 +14,39 @@ { "description": "Simple Vector PACKED_BIT", "valid": true, - "vector": [127, 7], + "vector": [127, 8], "dtype_hex": "0x10", "dtype_alias": "PACKED_BIT", "padding": 0, - "canonical_bson": "1600000005766563746F7200040000000910007F0700" + "canonical_bson": "1600000005766563746F7200040000000910007F0800" }, { - "description": "Empty Vector PACKED_BIT", + "description": "PACKED_BIT with padding", "valid": true, - "vector": [], + "vector": [127, 8], "dtype_hex": "0x10", "dtype_alias": "PACKED_BIT", - "padding": 0, - "canonical_bson": "1400000005766563746F72000200000009100000" + "padding": 3, + "canonical_bson": "1600000005766563746F7200040000000910037F0800" }, { - "description": "PACKED_BIT with padding", - "valid": true, + "description": "PACKED_BIT with inconsistent padding", + "valid": false, "vector": [127, 7], "dtype_hex": "0x10", "dtype_alias": "PACKED_BIT", "padding": 3, "canonical_bson": "1600000005766563746F7200040000000910037F0700" }, + { + "description": "Empty Vector PACKED_BIT", + "valid": true, + "vector": [], + "dtype_hex": "0x10", + "dtype_alias": "PACKED_BIT", + "padding": 0, + "canonical_bson": "1400000005766563746F72000200000009100000" + }, { "description": "Overflow Vector PACKED_BIT", "valid": false, diff --git a/test/test_bson_binary_vector.py b/test/test_bson_binary_vector.py index 9bfdcbfb9a..afe01f42bf 100644 --- a/test/test_bson_binary_vector.py +++ b/test/test_bson_binary_vector.py @@ -48,11 +48,11 @@ def create_test(case_spec): def run_test(self): for test_case in case_spec.get("tests", []): description = test_case["description"] - vector_exp = test_case.get("vector", []) + vector_exp = test_case.get("vector", None) dtype_hex_exp = test_case["dtype_hex"] dtype_alias_exp = test_case.get("dtype_alias") padding_exp = test_case.get("padding", 0) - canonical_bson_exp = test_case.get("canonical_bson") + canonical_bson_exp = test_case.get("canonical_bson", None) # Convert dtype hex string into bytes dtype_exp = BinaryVectorDtype(int(dtype_hex_exp, 16).to_bytes(1, byteorder="little")) @@ -85,14 +85,25 @@ def run_test(self): self.assertEqual(cB_obs, canonical_bson_exp, description) else: - with self.assertRaises((struct.error, ValueError), msg=description): - # Tests Binary.from_vector - Binary.from_vector(vector_exp, dtype_exp, padding_exp) - # Tests Binary.as_vector - cB_exp = binascii.unhexlify(canonical_bson_exp.encode("utf8")) - decoded_doc = decode(cB_exp) - binary_obs = decoded_doc[test_key] - binary_obs.as_vector() + """ + #### To prove correct in an invalid case (`valid:false`), one MUST + - if the vector field is present, raise an exception when attempting to encode a document from the numeric values, + dtype, and padding. + - if the canonical_bson field is present, raise an exception when attempting to deserialize it into the corresponding + numeric values, as the field contains corrupted data. + """ + # Tests Binary.from_vector() + if vector_exp is not None: + with self.assertRaises((struct.error, ValueError), msg=description): + Binary.from_vector(vector_exp, dtype_exp, padding_exp) + + # Tests Binary.as_vector() + if canonical_bson_exp is not None: + with self.assertRaises((struct.error, ValueError), msg=description): + cB_exp = binascii.unhexlify(canonical_bson_exp.encode("utf8")) + decoded_doc = decode(cB_exp) + binary_obs = decoded_doc[test_key] + binary_obs.as_vector() return run_test From 8b21176184e00c1158e4ab12967a8e1eec345f51 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Thu, 3 Apr 2025 18:40:33 -0400 Subject: [PATCH 2/4] Fixes typing error --- bson/binary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bson/binary.py b/bson/binary.py index 78d26094dc..48f1f58512 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -462,7 +462,7 @@ def from_vector( raise ValueError(f"{padding=}. It must be in [0,1, ..7].") if padding and not vector: raise ValueError("Empty vector with non-zero padding.") - if padding and not (vector[-1] & ((1 << padding) - 1)) == 0: + if padding and not (vector[-1] & ((1 << padding) - 1)) == 0: # type: ignore raise ValueError( "If padding p is provided, all bits in the final byte lower than p must be 0." ) From 9c5a4ae6ce3bd9380431bc9d6d05a77e42c45ec8 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Thu, 3 Apr 2025 18:56:35 -0400 Subject: [PATCH 3/4] Update example data TestBSON.test_vector as previous example now raises an exception. --- test/test_bson.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_bson.py b/test/test_bson.py index 1616c513c2..522945d5f4 100644 --- a/test/test_bson.py +++ b/test/test_bson.py @@ -739,7 +739,7 @@ def test_vector(self): """Tests of subtype 9""" # We start with valid cases, across the 3 dtypes implemented. # Work with a simple vector that can be interpreted as int8, float32, or ubyte - list_vector = [127, 7] + list_vector = [127, 8] # As INT8, vector has length 2 binary_vector = Binary.from_vector(list_vector, BinaryVectorDtype.INT8) vector = binary_vector.as_vector() @@ -764,18 +764,18 @@ def test_vector(self): uncompressed = "" for val in list_vector: uncompressed += format(val, "08b") - assert uncompressed[:-padding] == "0111111100000" + assert uncompressed[:-padding] == "0111111100001" # It is worthwhile explicitly showing the values encoded to BSON padded_doc = {"padded_vec": padded_vec} assert ( encode(padded_doc) - == b"\x1a\x00\x00\x00\x05padded_vec\x00\x04\x00\x00\x00\t\x10\x03\x7f\x07\x00" + == b"\x1a\x00\x00\x00\x05padded_vec\x00\x04\x00\x00\x00\t\x10\x03\x7f\x08\x00" ) # and dumped to json assert ( json_util.dumps(padded_doc) - == '{"padded_vec": {"$binary": {"base64": "EAN/Bw==", "subType": "09"}}}' + == '{"padded_vec": {"$binary": {"base64": "EAN/CA==", "subType": "09"}}}' ) # FLOAT32 is also implemented From abf60760a4c8b92d47bb6665db7fa760becf853d Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Tue, 8 Apr 2025 09:01:55 -0400 Subject: [PATCH 4/4] Resync of bson-binary-vector test after suggestion on specs PR --- test/bson_binary_vector/packed_bit.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/bson_binary_vector/packed_bit.json b/test/bson_binary_vector/packed_bit.json index 8823095632..3015acba66 100644 --- a/test/bson_binary_vector/packed_bit.json +++ b/test/bson_binary_vector/packed_bit.json @@ -14,11 +14,11 @@ { "description": "Simple Vector PACKED_BIT", "valid": true, - "vector": [127, 8], + "vector": [127, 7], "dtype_hex": "0x10", "dtype_alias": "PACKED_BIT", "padding": 0, - "canonical_bson": "1600000005766563746F7200040000000910007F0800" + "canonical_bson": "1600000005766563746F7200040000000910007F0700" }, { "description": "PACKED_BIT with padding",