Skip to content

Commit c0e78d2

Browse files
cj-zhukovSergey Zhukovalamb
authored
Remove use of deprecated dict_id in datafusion-proto (#14173) (#14227)
* Remove use of deprecated dict_id in datafusion-proto (#14173) * Fix issues causing GitHub checks to fail * Fix issues causing GitHub checks to fail * Fix issues causing GitHub checks to fail * Fix issues causing GitHub checks to fail * Fix issues causing GitHub checks to fail * Fix issues causing GitHub checks to fail * Fix issues causing GitHub checks to fail * Fix issues causing GitHub checks to fail * Fix issues causing GitHub checks to fail * Fix issues causing GitHub checks to fail * Fix issues causing GitHub checks to fail * Fix issues causing GitHub checks to fail * Fix issues causing GitHub checks to fail * Fix issues causing GitHub checks to fail * Fix issues causing GitHub checks to fail * Fix issues causing GitHub checks to fail * Fix issues causing GitHub checks to fail * remove accidental file * undo deletion of test in copy.slt * Fix issues causing GitHub checks to fail --------- Co-authored-by: Sergey Zhukov <[email protected]> Co-authored-by: Andrew Lamb <[email protected]>
1 parent f64097f commit c0e78d2

File tree

9 files changed

+18
-152
lines changed

9 files changed

+18
-152
lines changed

Diff for: datafusion/proto-common/proto/datafusion_common.proto

+1-2
Original file line numberDiff line numberDiff line change
@@ -108,8 +108,7 @@ message Field {
108108
// for complex data types like structs, unions
109109
repeated Field children = 4;
110110
map<string, string> metadata = 5;
111-
int64 dict_id = 6;
112-
bool dict_ordered = 7;
111+
bool dict_ordered = 6;
113112
}
114113

115114
message Timestamp{

Diff for: datafusion/proto-common/src/from_proto/mod.rs

+12-43
Original file line numberDiff line numberDiff line change
@@ -320,21 +320,8 @@ impl TryFrom<&protobuf::Field> for Field {
320320
type Error = Error;
321321
fn try_from(field: &protobuf::Field) -> Result<Self, Self::Error> {
322322
let datatype = field.arrow_type.as_deref().required("arrow_type")?;
323-
let field = if field.dict_id != 0 {
324-
// https://github.com/apache/datafusion/issues/14173
325-
#[allow(deprecated)]
326-
Self::new_dict(
327-
field.name.as_str(),
328-
datatype,
329-
field.nullable,
330-
field.dict_id,
331-
field.dict_ordered,
332-
)
333-
.with_metadata(field.metadata.clone())
334-
} else {
335-
Self::new(field.name.as_str(), datatype, field.nullable)
336-
.with_metadata(field.metadata.clone())
337-
};
323+
let field = Self::new(field.name.as_str(), datatype, field.nullable)
324+
.with_metadata(field.metadata.clone());
338325
Ok(field)
339326
}
340327
}
@@ -436,36 +423,18 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue {
436423

437424
let id = dict_batch.id();
438425

439-
let fields_using_this_dictionary = {
440-
// See https://github.com/apache/datafusion/issues/14173
441-
#[allow(deprecated)]
442-
schema.fields_with_dict_id(id)
443-
};
426+
let record_batch = read_record_batch(
427+
&buffer,
428+
dict_batch.data().unwrap(),
429+
Arc::new(schema.clone()),
430+
&Default::default(),
431+
None,
432+
&message.version(),
433+
)?;
444434

445-
let first_field = fields_using_this_dictionary.first().ok_or_else(|| {
446-
Error::General("dictionary id not found in schema while deserializing ScalarValue::List".to_string())
447-
})?;
435+
let values: ArrayRef = Arc::clone(record_batch.column(0));
448436

449-
let values: ArrayRef = match first_field.data_type() {
450-
DataType::Dictionary(_, ref value_type) => {
451-
// Make a fake schema for the dictionary batch.
452-
let value = value_type.as_ref().clone();
453-
let schema = Schema::new(vec![Field::new("", value, true)]);
454-
// Read a single column
455-
let record_batch = read_record_batch(
456-
&buffer,
457-
dict_batch.data().unwrap(),
458-
Arc::new(schema),
459-
&Default::default(),
460-
None,
461-
&message.version(),
462-
)?;
463-
Ok(Arc::clone(record_batch.column(0)))
464-
}
465-
_ => Err(Error::General("dictionary id not found in schema while deserializing ScalarValue::List".to_string())),
466-
}?;
467-
468-
Ok((id,values))
437+
Ok((id, values))
469438
}).collect::<datafusion_common::Result<HashMap<_, _>>>()?;
470439

471440
let record_batch = read_record_batch(

Diff for: datafusion/proto-common/src/generated/pbjson.rs

-23
Original file line numberDiff line numberDiff line change
@@ -3107,9 +3107,6 @@ impl serde::Serialize for Field {
31073107
if !self.metadata.is_empty() {
31083108
len += 1;
31093109
}
3110-
if self.dict_id != 0 {
3111-
len += 1;
3112-
}
31133110
if self.dict_ordered {
31143111
len += 1;
31153112
}
@@ -3129,19 +3126,13 @@ impl serde::Serialize for Field {
31293126
if !self.metadata.is_empty() {
31303127
struct_ser.serialize_field("metadata", &self.metadata)?;
31313128
}
3132-
if self.dict_id != 0 {
3133-
#[allow(clippy::needless_borrow)]
3134-
#[allow(clippy::needless_borrows_for_generic_args)]
3135-
struct_ser.serialize_field("dictId", ToString::to_string(&self.dict_id).as_str())?;
3136-
}
31373129
if self.dict_ordered {
31383130
struct_ser.serialize_field("dictOrdered", &self.dict_ordered)?;
31393131
}
31403132
struct_ser.end()
31413133
}
31423134
}
31433135
impl<'de> serde::Deserialize<'de> for Field {
3144-
#[allow(deprecated)]
31453136
fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
31463137
where
31473138
D: serde::Deserializer<'de>,
@@ -3153,8 +3144,6 @@ impl<'de> serde::Deserialize<'de> for Field {
31533144
"nullable",
31543145
"children",
31553146
"metadata",
3156-
"dict_id",
3157-
"dictId",
31583147
"dict_ordered",
31593148
"dictOrdered",
31603149
];
@@ -3166,7 +3155,6 @@ impl<'de> serde::Deserialize<'de> for Field {
31663155
Nullable,
31673156
Children,
31683157
Metadata,
3169-
DictId,
31703158
DictOrdered,
31713159
}
31723160
impl<'de> serde::Deserialize<'de> for GeneratedField {
@@ -3194,7 +3182,6 @@ impl<'de> serde::Deserialize<'de> for Field {
31943182
"nullable" => Ok(GeneratedField::Nullable),
31953183
"children" => Ok(GeneratedField::Children),
31963184
"metadata" => Ok(GeneratedField::Metadata),
3197-
"dictId" | "dict_id" => Ok(GeneratedField::DictId),
31983185
"dictOrdered" | "dict_ordered" => Ok(GeneratedField::DictOrdered),
31993186
_ => Err(serde::de::Error::unknown_field(value, FIELDS)),
32003187
}
@@ -3220,7 +3207,6 @@ impl<'de> serde::Deserialize<'de> for Field {
32203207
let mut nullable__ = None;
32213208
let mut children__ = None;
32223209
let mut metadata__ = None;
3223-
let mut dict_id__ = None;
32243210
let mut dict_ordered__ = None;
32253211
while let Some(k) = map_.next_key()? {
32263212
match k {
@@ -3256,14 +3242,6 @@ impl<'de> serde::Deserialize<'de> for Field {
32563242
map_.next_value::<std::collections::HashMap<_, _>>()?
32573243
);
32583244
}
3259-
GeneratedField::DictId => {
3260-
if dict_id__.is_some() {
3261-
return Err(serde::de::Error::duplicate_field("dictId"));
3262-
}
3263-
dict_id__ =
3264-
Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0)
3265-
;
3266-
}
32673245
GeneratedField::DictOrdered => {
32683246
if dict_ordered__.is_some() {
32693247
return Err(serde::de::Error::duplicate_field("dictOrdered"));
@@ -3278,7 +3256,6 @@ impl<'de> serde::Deserialize<'de> for Field {
32783256
nullable: nullable__.unwrap_or_default(),
32793257
children: children__.unwrap_or_default(),
32803258
metadata: metadata__.unwrap_or_default(),
3281-
dict_id: dict_id__.unwrap_or_default(),
32823259
dict_ordered: dict_ordered__.unwrap_or_default(),
32833260
})
32843261
}

Diff for: datafusion/proto-common/src/generated/prost.rs

+1-3
Original file line numberDiff line numberDiff line change
@@ -106,9 +106,7 @@ pub struct Field {
106106
::prost::alloc::string::String,
107107
::prost::alloc::string::String,
108108
>,
109-
#[prost(int64, tag = "6")]
110-
pub dict_id: i64,
111-
#[prost(bool, tag = "7")]
109+
#[prost(bool, tag = "6")]
112110
pub dict_ordered: bool,
113111
}
114112
#[derive(Clone, PartialEq, ::prost::Message)]

Diff for: datafusion/proto-common/src/to_proto/mod.rs

-3
Original file line numberDiff line numberDiff line change
@@ -97,9 +97,6 @@ impl TryFrom<&Field> for protobuf::Field {
9797
nullable: field.is_nullable(),
9898
children: Vec::new(),
9999
metadata: field.metadata().clone(),
100-
#[allow(deprecated)]
101-
// See https://github.com/apache/datafusion/issues/14173 to remove deprecated dict_id
102-
dict_id: field.dict_id().unwrap_or(0),
103100
dict_ordered: field.dict_is_ordered().unwrap_or(false),
104101
})
105102
}

Diff for: datafusion/proto/src/generated/datafusion_proto_common.rs

+1-3
Original file line numberDiff line numberDiff line change
@@ -106,9 +106,7 @@ pub struct Field {
106106
::prost::alloc::string::String,
107107
::prost::alloc::string::String,
108108
>,
109-
#[prost(int64, tag = "6")]
110-
pub dict_id: i64,
111-
#[prost(bool, tag = "7")]
109+
#[prost(bool, tag = "6")]
112110
pub dict_ordered: bool,
113111
}
114112
#[derive(Clone, PartialEq, ::prost::Message)]

Diff for: datafusion/proto/tests/cases/roundtrip_logical_plan.rs

-72
Original file line numberDiff line numberDiff line change
@@ -1494,20 +1494,6 @@ fn round_trip_scalar_values_and_data_types() {
14941494
Field::new("b", DataType::Boolean, false),
14951495
ScalarValue::from(false),
14961496
)
1497-
.with_scalar(
1498-
Field::new(
1499-
"c",
1500-
DataType::Dictionary(
1501-
Box::new(DataType::UInt16),
1502-
Box::new(DataType::Utf8),
1503-
),
1504-
false,
1505-
),
1506-
ScalarValue::Dictionary(
1507-
Box::new(DataType::UInt16),
1508-
Box::new("value".into()),
1509-
),
1510-
)
15111497
.build()
15121498
.unwrap(),
15131499
ScalarValue::try_from(&DataType::Struct(Fields::from(vec![
@@ -1518,25 +1504,6 @@ fn round_trip_scalar_values_and_data_types() {
15181504
ScalarValue::try_from(&DataType::Struct(Fields::from(vec![
15191505
Field::new("a", DataType::Int32, true),
15201506
Field::new("b", DataType::Boolean, false),
1521-
Field::new(
1522-
"c",
1523-
DataType::Dictionary(
1524-
Box::new(DataType::UInt16),
1525-
Box::new(DataType::Binary),
1526-
),
1527-
false,
1528-
),
1529-
Field::new(
1530-
"d",
1531-
DataType::new_list(
1532-
DataType::Dictionary(
1533-
Box::new(DataType::UInt16),
1534-
Box::new(DataType::Binary),
1535-
),
1536-
false,
1537-
),
1538-
false,
1539-
),
15401507
])))
15411508
.unwrap(),
15421509
ScalarValue::try_from(&DataType::Map(
@@ -1815,45 +1782,6 @@ fn round_trip_datatype() {
18151782
}
18161783
}
18171784

1818-
// See https://github.com/apache/datafusion/issues/14173 to remove deprecated dict_id
1819-
#[allow(deprecated)]
1820-
#[test]
1821-
fn roundtrip_dict_id() -> Result<()> {
1822-
let dict_id = 42;
1823-
let field = Field::new(
1824-
"keys",
1825-
DataType::List(Arc::new(Field::new_dict(
1826-
"item",
1827-
DataType::Dictionary(Box::new(DataType::UInt16), Box::new(DataType::Utf8)),
1828-
true,
1829-
dict_id,
1830-
false,
1831-
))),
1832-
false,
1833-
);
1834-
let schema = Arc::new(Schema::new(vec![field]));
1835-
1836-
// encode
1837-
let mut buf: Vec<u8> = vec![];
1838-
let schema_proto: protobuf::Schema = schema.try_into().unwrap();
1839-
schema_proto.encode(&mut buf).unwrap();
1840-
1841-
// decode
1842-
let schema_proto = protobuf::Schema::decode(buf.as_slice()).unwrap();
1843-
let decoded: Schema = (&schema_proto).try_into()?;
1844-
1845-
// assert
1846-
let keys = decoded.fields().iter().last().unwrap();
1847-
match keys.data_type() {
1848-
DataType::List(field) => {
1849-
assert_eq!(field.dict_id(), Some(dict_id), "dict_id should be retained");
1850-
}
1851-
_ => panic!("Invalid type"),
1852-
}
1853-
1854-
Ok(())
1855-
}
1856-
18571785
#[test]
18581786
fn roundtrip_null_scalar_values() {
18591787
let test_types = vec![

Diff for: datafusion/sqllogictest/test_files/copy.slt

+1-1
Original file line numberDiff line numberDiff line change
@@ -558,7 +558,6 @@ select * from validate_arrow_file_dict;
558558
c foo
559559
d bar
560560

561-
562561
# Copy from table to folder of json
563562
query I
564563
COPY source_table to 'test_files/scratch/copy/table_arrow' STORED AS ARROW;
@@ -632,3 +631,4 @@ COPY source_table to '/tmp/table.parquet' (row_group_size 55 + 102);
632631
# Copy using execution.keep_partition_by_columns with an invalid value
633632
query error DataFusion error: Invalid or Unsupported Configuration: provided value for 'execution.keep_partition_by_columns' was not recognized: "invalid_value"
634633
COPY source_table to '/tmp/table.parquet' OPTIONS (execution.keep_partition_by_columns invalid_value);
634+

Diff for: datafusion/sqllogictest/test_files/regexp.slt

+2-2
Original file line numberDiff line numberDiff line change
@@ -477,8 +477,8 @@ SELECT 'foo\nbar\nbaz' ~ 'bar';
477477
true
478478

479479
statement error
480-
Error during planning: Cannot infer common argument type for regex operation List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata
481-
: {} }) ~ List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
480+
Error during planning: Cannot infer common argument type for regex operation List(Field { name: "item", data_type: Int64, nullable: true, dict_is_ordered: false, metadata
481+
: {} }) ~ List(Field { name: "item", data_type: Int64, nullable: true, dict_is_ordered: false, metadata: {} })
482482
select [1,2] ~ [3];
483483

484484
query B

0 commit comments

Comments
 (0)