Skip to content

Commit a962522

Browse files
authored
Support both 0x01 and 0x02 as type for list of booleans in thrift metadata (apache#7052)
* Support both 0x01 and 0x02 as type for list of booleans * Also support 0 for false inside boolean collections * Use hex notation in tests
1 parent 8b2b1ef commit a962522

File tree

1 file changed

+60
-2
lines changed

1 file changed

+60
-2
lines changed

parquet/src/thrift.rs

+60-2
Original file line numberDiff line numberDiff line change
@@ -193,9 +193,13 @@ impl TInputProtocol for TCompactSliceInputProtocol<'_> {
193193
Some(b) => Ok(b),
194194
None => {
195195
let b = self.read_byte()?;
196+
// Previous versions of the thrift specification said to use 0 and 1 inside collections,
197+
// but that differed from existing implementations.
198+
// The specification was updated in https://github.com/apache/thrift/commit/2c29c5665bc442e703480bb0ee60fe925ffe02e8.
199+
// At least the go implementation seems to have followed the previously documented values.
196200
match b {
197201
0x01 => Ok(true),
198-
0x02 => Ok(false),
202+
0x00 | 0x02 => Ok(false),
199203
unkn => Err(thrift::Error::Protocol(thrift::ProtocolError {
200204
kind: thrift::ProtocolErrorKind::InvalidData,
201205
message: format!("cannot convert {} into bool", unkn),
@@ -274,7 +278,12 @@ impl TInputProtocol for TCompactSliceInputProtocol<'_> {
274278

275279
fn collection_u8_to_type(b: u8) -> thrift::Result<TType> {
276280
match b {
277-
0x01 => Ok(TType::Bool),
281+
// For historical and compatibility reasons, a reader should be capable to deal with both cases.
282+
// The only valid value in the original spec was 2, but due to an widespread implementation bug
283+
// the defacto standard across large parts of the library became 1 instead.
284+
// As a result, both values are now allowed.
285+
// https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#list-and-set
286+
0x01 | 0x02 => Ok(TType::Bool),
278287
o => u8_to_type(o),
279288
}
280289
}
@@ -305,3 +314,52 @@ fn eof_error() -> thrift::Error {
305314
message: "Unexpected EOF".to_string(),
306315
})
307316
}
317+
318+
#[cfg(test)]
319+
mod tests {
320+
use crate::format::{BoundaryOrder, ColumnIndex};
321+
use crate::thrift::{TCompactSliceInputProtocol, TSerializable};
322+
323+
#[test]
324+
pub fn read_boolean_list_field_type() {
325+
// Boolean collection type encoded as 0x01, as used by this crate when writing.
326+
// Values encoded as 1 (true) or 2 (false) as in the current version of the thrift
327+
// documentation.
328+
let bytes = vec![0x19, 0x21, 2, 1, 0x19, 8, 0x19, 8, 0x15, 0, 0];
329+
330+
let mut protocol = TCompactSliceInputProtocol::new(bytes.as_slice());
331+
let index = ColumnIndex::read_from_in_protocol(&mut protocol).unwrap();
332+
let expected = ColumnIndex {
333+
null_pages: vec![false, true],
334+
min_values: vec![],
335+
max_values: vec![],
336+
boundary_order: BoundaryOrder::UNORDERED,
337+
null_counts: None,
338+
repetition_level_histograms: None,
339+
definition_level_histograms: None,
340+
};
341+
342+
assert_eq!(&index, &expected);
343+
}
344+
345+
#[test]
346+
pub fn read_boolean_list_alternative_encoding() {
347+
// Boolean collection type encoded as 0x02, as allowed by the spec.
348+
// Values encoded as 1 (true) or 0 (false) as before the thrift documentation change on 2024-12-13.
349+
let bytes = vec![0x19, 0x22, 0, 1, 0x19, 8, 0x19, 8, 0x15, 0, 0];
350+
351+
let mut protocol = TCompactSliceInputProtocol::new(bytes.as_slice());
352+
let index = ColumnIndex::read_from_in_protocol(&mut protocol).unwrap();
353+
let expected = ColumnIndex {
354+
null_pages: vec![false, true],
355+
min_values: vec![],
356+
max_values: vec![],
357+
boundary_order: BoundaryOrder::UNORDERED,
358+
null_counts: None,
359+
repetition_level_histograms: None,
360+
definition_level_histograms: None,
361+
};
362+
363+
assert_eq!(&index, &expected);
364+
}
365+
}

0 commit comments

Comments
 (0)