
Commit bfce4eb

Restore Spark compatibility
1 parent 11d6c6a commit bfce4eb

File tree

5 files changed: +43 -25 lines

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.7.1] - 2024-01-15
+### Changed
+- Reverted to use `int32` and `int64` for `op` and `offset` respectively to preserve compatibility with Spark engine until Spark's Parquet version is upgraded.
+
 ## [0.7.0] - 2024-01-10
 ### Changed
 - Migration to ODF changelog schema
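
For context on the entry above: Parquet has no unsigned physical integer types, so Arrow's `UInt8` and `UInt64` serialize as physical `INT32`/`INT64` plus an `INTEGER(bits,false)` logical-type annotation, which the Parquet version bundled with Spark does not yet understand, while plain signed `Int32`/`Int64` need no annotation at all. A minimal sketch of the kind of cast this release applies (not part of this commit; it assumes only the `arrow` crate that DataFusion re-exports):

use std::sync::Arc;

use datafusion::arrow::array::{ArrayRef, UInt8Array};
use datafusion::arrow::compute::cast;
use datafusion::arrow::datatypes::DataType;

fn main() {
    // An operation-type column as the previous release produced it (UInt8)
    let op: ArrayRef = Arc::new(UInt8Array::from(vec![0u8, 1, 2]));

    // Cast to the Spark-compatible signed representation; in Parquet this
    // becomes a bare INT32 with no logical-type annotation
    let op_compat = cast(&op, &DataType::Int32).unwrap();
    assert_eq!(op_compat.data_type(), &DataType::Int32);
}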

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [package]
 name = "kamu-engine-datafusion"
-version = "0.7.0"
+version = "0.7.1"
 authors = ["Kamu Data Inc. <[email protected]>"]
 license-file = "LICENSE.txt"
 edition = "2021"

src/engine.rs

Lines changed: 23 additions & 11 deletions
@@ -127,8 +127,10 @@ impl Engine {
 
         // Get result's execution plan
         let df = ctx.table(Self::OUTPUT_VIEW_NAME).await.int_err()?;
+        tracing::info!(schema = ?df.schema(), "Raw result schema");
 
         let df = Self::normalize_raw_result(df, &request.vocab)?;
+        tracing::info!(schema = ?df.schema(), "Normalized result schema");
 
         Self::validate_raw_result(&df, &request.vocab)?;
 
@@ -321,11 +323,17 @@ impl Engine {
                 )
                 .alias(field.name())
             }
-            // For compatibility with engines that cannot write correct Parquet logical types we
-            // allow plain INT32, but cast it to UINT8 here
-            DataType::Int32 if *field.name() == vocab.operation_type_column => {
+            // TODO: Normalize towards UInt8 after Spark is updated
+            // See: https://github.com/kamu-data/kamu-cli/issues/445
+            DataType::Int8
+            | DataType::UInt8
+            | DataType::Int16
+            | DataType::UInt16
+            | DataType::UInt32
+                if *field.name() == vocab.operation_type_column =>
+            {
                 noop = false;
-                cast(col(field.unqualified_column()), DataType::UInt8).alias(field.name())
+                cast(col(field.unqualified_column()), DataType::Int32).alias(field.name())
             }
             _ => col(field.unqualified_column()),
         };
@@ -343,8 +351,6 @@ impl Engine {
         df: &DataFrame,
         vocab: &DatasetVocabulary,
     ) -> Result<(), ExecuteTransformError> {
-        tracing::info!(schema = ?df.schema(), "Computed raw result schema");
-
         let system_columns = [&vocab.offset_column, &vocab.system_time_column];
         for system_column in system_columns {
             if df.schema().has_column_with_unqualified_name(system_column) {
@@ -366,11 +372,13 @@ impl Engine {
             .first()
         {
             match op_col.data_type() {
-                DataType::UInt8 => {}
+                // TODO: Require UInt8 after Spark is updated
+                // See: https://github.com/kamu-data/kamu-cli/issues/445
+                DataType::Int32 => {}
                 typ => {
                     return Err(TransformResponseInvalidQuery {
                         message: format!(
-                            "Operation type column '{}' should be UInt8, but found: {}",
+                            "Operation type column '{}' should be Int32, but found: {}",
                             vocab.operation_type_column, typ
                         ),
                     }
@@ -475,11 +483,13 @@ impl Engine {
             }),
         )?;
 
+        // TODO: Cast to UInt64 after Spark is updated
+        // See: https://github.com/kamu-data/kamu-cli/issues/445
         let df = df.with_column(
             &vocab.offset_column,
             cast(
                 col(&vocab.offset_column as &str) + lit(start_offset as i64 - 1),
-                DataType::UInt64,
+                DataType::Int64,
             ),
         )?;
 
@@ -490,7 +500,9 @@ impl Engine {
         {
             df.with_column(
                 &vocab.operation_type_column,
-                lit(OperationType::Append as u8),
+                // TODO: Cast to u8 after Spark is updated
+                // See: https://github.com/kamu-data/kamu-cli/issues/445
+                lit(OperationType::Append as i32),
             )?
         } else {
             df
@@ -572,6 +584,6 @@ impl Engine {
             tracing::info!("Produced empty result",);
             let _ = std::fs::remove_file(path);
         }
-        Ok(num_records)
+        Ok(num_records as u64)
     }
 }

tests/tests/test_transform.rs

Lines changed: 14 additions & 12 deletions
@@ -53,15 +53,17 @@ fn write_sample_data(path: impl AsRef<Path>, data: &[Record]) {
     use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit};
     use datafusion::arrow::record_batch::RecordBatch;
 
+    // TODO: Replace with UInt64 and UInt8 after Spark is updated
+    // See: https://github.com/kamu-data/kamu-cli/issues/445
     let schema = Arc::new(Schema::new(vec![
         Field::new(
             DatasetVocabulary::DEFAULT_OFFSET_COLUMN_NAME,
-            DataType::UInt64,
+            DataType::Int64,
             false,
         ),
         Field::new(
             DatasetVocabulary::DEFAULT_OPERATION_TYPE_COLUMN_NAME,
-            DataType::UInt8,
+            DataType::Int32,
             false,
         ),
         Field::new(
@@ -81,11 +83,11 @@ fn write_sample_data(path: impl AsRef<Path>, data: &[Record]) {
     let record_batch = RecordBatch::try_new(
         schema,
         vec![
-            Arc::new(array::UInt64Array::from(
-                data.iter().map(|r| r.offset).collect::<Vec<_>>(),
+            Arc::new(array::Int64Array::from(
+                data.iter().map(|r| r.offset as i64).collect::<Vec<_>>(),
             )),
-            Arc::new(array::UInt8Array::from(
-                data.iter().map(|r| r.op as u8).collect::<Vec<_>>(),
+            Arc::new(array::Int32Array::from(
+                data.iter().map(|r| r.op as i32).collect::<Vec<_>>(),
             )),
             Arc::new(
                 array::TimestampMillisecondArray::from(
@@ -315,8 +317,8 @@ async fn test_result_schema() {
         expected_schema: Some(indoc!(
             r#"
             message arrow_schema {
-              OPTIONAL INT64 offset (INTEGER(64,false));
-              REQUIRED INT32 op (INTEGER(8,false));
+              OPTIONAL INT64 offset;
+              REQUIRED INT32 op;
               REQUIRED INT64 system_time (TIMESTAMP(MILLIS,true));
               REQUIRED INT64 event_time (TIMESTAMP(MILLIS,true));
               REQUIRED BYTE_ARRAY city (STRING);
@@ -337,8 +339,8 @@ async fn test_result_optimal_parquet_encoding() {
         expected_schema: Some(indoc!(
             r#"
             message arrow_schema {
-              OPTIONAL INT64 offset (INTEGER(64,false));
-              REQUIRED INT32 op (INTEGER(8,false));
+              OPTIONAL INT64 offset;
+              REQUIRED INT32 op;
               REQUIRED INT64 system_time (TIMESTAMP(MILLIS,true));
               REQUIRED INT64 event_time (TIMESTAMP(MILLIS,true));
               REQUIRED BYTE_ARRAY city (STRING);
@@ -667,8 +669,8 @@ async fn test_event_time_coerced_to_millis() {
         expected_schema: Some(indoc!(
             r#"
             message arrow_schema {
-              OPTIONAL INT64 offset (INTEGER(64,false));
-              REQUIRED INT32 op (INTEGER(8,false));
+              OPTIONAL INT64 offset;
+              REQUIRED INT32 op;
               REQUIRED INT64 system_time (TIMESTAMP(MILLIS,true));
               REQUIRED INT64 event_time (TIMESTAMP(MILLIS,true));
               REQUIRED BYTE_ARRAY city (STRING);
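
The updated `expected_schema` strings drop the `(INTEGER(64,false))` and `(INTEGER(8,false))` annotations because signed Arrow integers map to bare Parquet physical types. A hypothetical round-trip check (not part of the test suite; the file path and values are made up, and the `parquet` re-export paths may vary between versions) that writes a batch shaped like the engine's output and prints the resulting Parquet schema:

use std::sync::Arc;

use datafusion::arrow::array::{Int32Array, Int64Array};
use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::parquet::arrow::ArrowWriter;
use datafusion::parquet::file::reader::{FileReader, SerializedFileReader};
use datafusion::parquet::schema::printer::print_schema;

fn main() {
    let schema = Arc::new(Schema::new(vec![
        Field::new("offset", DataType::Int64, true), // nullable -> OPTIONAL
        Field::new("op", DataType::Int32, false),    // non-null -> REQUIRED
    ]));
    let batch = RecordBatch::try_new(
        schema.clone(),
        vec![
            Arc::new(Int64Array::from(vec![0i64, 1, 2])),
            Arc::new(Int32Array::from(vec![0i32, 0, 0])),
        ],
    )
    .unwrap();

    let file = std::fs::File::create("/tmp/op_compat.parquet").unwrap();
    let mut writer = ArrowWriter::try_new(file, schema, None).unwrap();
    writer.write(&batch).unwrap();
    writer.close().unwrap();

    // Should print roughly:
    //   message arrow_schema {
    //     OPTIONAL INT64 offset;
    //     REQUIRED INT32 op;
    //   }
    let file = std::fs::File::open("/tmp/op_compat.parquet").unwrap();
    let reader = SerializedFileReader::new(file).unwrap();
    print_schema(
        &mut std::io::stdout(),
        reader.metadata().file_metadata().schema(),
    );
}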
