Skip to content
This repository was archived by the owner on Jan 7, 2025. It is now read-only.

Commit 8f269c5

Browse files
authored
feat(df-repr/bridge): upgrade datafusion to 43.0.0 (#260)
Besides the upgrade itself: * New `create_df_context` to be used across all crates to create a datafusion context with optd. We had too much duplicate code before to set up the context. * The main refactor is about the aggregation expressions. Datafusion has a new way of doing that. * Datafusion removed cross join. We didn't. We can eventually remove it but now it's blocked on two-stage cascades: if we simply treat cross join the same as inner join, we would time out. * Several other refactors to adapt to datafusion (e.g., limit node now takes i64, empty relation / placeholder row executor) * Keep as much of the original datafusion cli crate as possible. We now only patch main.rs and exec.rs. * There's one more breaking change that we might encounter later when doing sort physical properties. Now datafusion logical plan will remove duplicate sorts if there are no limits present. I feel this is a bad move b/c it's not a direct mapping from the original SQL statement... --------- Signed-off-by: Alex Chi <[email protected]>
1 parent 6696706 commit 8f269c5

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

67 files changed

+12894
-14378
lines changed

Cargo.lock

+969-670
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion-optd-cli/Cargo.lock

-3,937
This file was deleted.

datafusion-optd-cli/Cargo.toml

+26-17
Original file line numberDiff line numberDiff line change
@@ -18,52 +18,61 @@
1818
[package]
1919
name = "datafusion-optd-cli"
2020
description = "Command Line Client for DataFusion query engine."
21-
version = "32.0.0"
21+
version = "43.0.0"
22+
authors = ["Apache DataFusion <[email protected]>"]
2223
edition = "2021"
2324
keywords = ["arrow", "datafusion", "query", "sql"]
2425
license = "Apache-2.0"
2526
homepage = "https://github.com/cmu-db/optd"
2627
repository = "https://github.com/cmu-db/optd"
27-
rust-version = "1.70"
28+
# Specify MSRV here as `cargo msrv` doesn't support workspace version
29+
rust-version = "1.79"
2830
readme = "README.md"
2931

3032
[dependencies]
31-
arrow = "47.0.0"
32-
async-trait = "0.1.41"
33-
aws-config = "0.55"
34-
aws-credential-types = "0.55"
35-
clap = { version = "3", features = ["derive", "cargo"] }
36-
datafusion = { version = "32.0.0", features = [
33+
arrow = { version = "53.0.0" }
34+
async-trait = "0.1.73"
35+
aws-config = "1.5.5"
36+
aws-sdk-sso = "1.43.0"
37+
aws-sdk-ssooidc = "1.44.0"
38+
aws-sdk-sts = "1.43.0"
39+
# end pin aws-sdk crates
40+
aws-credential-types = "1.2.0"
41+
clap = { version = "4.5.16", features = ["derive", "cargo"] }
42+
datafusion = { version = "43.0.0", features = [
3743
"avro",
3844
"crypto_expressions",
45+
"datetime_expressions",
3946
"encoding_expressions",
47+
"parquet",
4048
"regex_expressions",
4149
"unicode_expressions",
4250
"compression",
4351
] }
44-
dirs = "4.0.0"
45-
env_logger = "0.9"
52+
dirs = "5.0.1"
53+
env_logger = "0.11"
54+
futures = "0.3"
4655
mimalloc = { version = "0.1", default-features = false }
47-
object_store = { version = "0.7.0", features = ["aws", "gcp"] }
56+
object_store = { version = "0.11.0", features = ["aws", "gcp", "http"] }
4857
parking_lot = { version = "0.12" }
58+
parquet = { version = "53.0.0", default-features = false }
4959
regex = "1.8"
50-
rustyline = "11.0"
60+
rustyline = "14.0"
5161
tokio = { version = "1.24", features = [
5262
"macros",
5363
"rt",
5464
"rt-multi-thread",
5565
"sync",
5666
"parking_lot",
67+
"signal",
5768
] }
5869
url = "2.2"
70+
# begin optd-cli patch
5971
optd-datafusion-bridge = { path = "../optd-datafusion-bridge", version = "0.1" }
60-
optd-datafusion-repr-adv-cost = { path = "../optd-datafusion-repr-adv-cost", version = "0.1" }
61-
optd-datafusion-repr = { path = "../optd-datafusion-repr", version = "0.1" }
62-
tracing-subscriber = "0.3"
63-
tracing = "0.1"
72+
# end optd-cli patch
6473

6574
[dev-dependencies]
6675
assert_cmd = "2.0"
6776
ctor = "0.2.0"
6877
predicates = "3.0"
69-
rstest = "0.17"
78+
rstest = "0.22"

datafusion-optd-cli/Dockerfile

+9-8
Original file line numberDiff line numberDiff line change
@@ -15,22 +15,23 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717

18-
FROM rust:1.70 as builder
18+
FROM rust:1.79-bookworm AS builder
1919

20-
COPY . /usr/src/arrow-datafusion
21-
COPY ./datafusion /usr/src/arrow-datafusion/datafusion
20+
COPY . /usr/src/datafusion
21+
COPY ./datafusion /usr/src/datafusion/datafusion
22+
COPY ./datafusion-cli /usr/src/datafusion/datafusion-cli
2223

23-
COPY ./datafusion-cli /usr/src/arrow-datafusion/datafusion-cli
24-
25-
WORKDIR /usr/src/arrow-datafusion/datafusion-cli
24+
WORKDIR /usr/src/datafusion/datafusion-cli
2625

2726
RUN rustup component add rustfmt
2827

2928
RUN cargo build --release
3029

31-
FROM debian:bullseye-slim
30+
FROM debian:bookworm-slim
31+
32+
COPY --from=builder /usr/src/datafusion/datafusion-cli/target/release/datafusion-cli /usr/local/bin
3233

33-
COPY --from=builder /usr/src/arrow-datafusion/datafusion-cli/target/release/datafusion-cli /usr/local/bin
34+
RUN mkdir /data
3435

3536
ENTRYPOINT ["datafusion-cli"]
3637

datafusion-optd-cli/README.md

+21-3
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,26 @@
2121

2222
# DataFusion Command-line Interface
2323

24-
[DataFusion](https://arrow.apache.org/datafusion/) is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
24+
[DataFusion](https://datafusion.apache.org/) is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
2525

26-
The DataFusion CLI is a command line utility that runs SQL queries using the DataFusion engine.
26+
DataFusion CLI (`datafusion-cli`) is a small command line utility that runs SQL queries using the DataFusion engine.
2727

28-
See the [`datafusion-cli` documentation](https://arrow.apache.org/datafusion/user-guide/cli.html) for further information.
28+
# Frequently Asked Questions
29+
30+
## Where can I find more information?
31+
32+
See the [`datafusion-cli` documentation](https://datafusion.apache.org/user-guide/cli/index.html) for further information.
33+
34+
## How do I make my IDE work with `datafusion-cli`?
35+
36+
"open" the `datafusion/datafusion-cli` project as its own top level
37+
project in your IDE (rather than opening `datafusion`).
38+
39+
The reason `datafusion-cli` is not part of the main workspace in
40+
[`datafusion Cargo.toml`] file is that `datafusion-cli` is a binary and has a
41+
checked in `Cargo.lock` file to ensure reproducible builds.
42+
43+
However, the `datafusion` and sub crates are intended for use as libraries and
44+
thus do not have a `Cargo.lock` file checked in.
45+
46+
[`datafusion Cargo.toml`]: https://github.com/apache/datafusion/blob/main/Cargo.toml
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! Shows an example of a custom session context that unions the input plan with itself.
19+
//! To run this example, use `cargo run --example cli-session-context` from within the `datafusion-optd-cli` directory.
20+
21+
use std::sync::Arc;
22+
23+
use datafusion::{
24+
dataframe::DataFrame,
25+
error::DataFusionError,
26+
execution::{context::SessionState, TaskContext},
27+
logical_expr::{LogicalPlan, LogicalPlanBuilder},
28+
prelude::SessionContext,
29+
};
30+
use datafusion_optd_cli::{
31+
cli_context::CliSessionContext, exec::exec_from_repl, print_options::PrintOptions,
32+
};
33+
use object_store::ObjectStore;
34+
35+
/// This is a toy example of a custom session context that unions the input plan with itself.
36+
struct MyUnionerContext {
37+
ctx: SessionContext,
38+
}
39+
40+
impl Default for MyUnionerContext {
41+
fn default() -> Self {
42+
Self {
43+
ctx: SessionContext::new(),
44+
}
45+
}
46+
}
47+
48+
#[async_trait::async_trait]
49+
impl CliSessionContext for MyUnionerContext {
50+
fn task_ctx(&self) -> Arc<TaskContext> {
51+
self.ctx.task_ctx()
52+
}
53+
54+
fn session_state(&self) -> SessionState {
55+
self.ctx.state()
56+
}
57+
58+
fn register_object_store(
59+
&self,
60+
url: &url::Url,
61+
object_store: Arc<dyn ObjectStore>,
62+
) -> Option<Arc<dyn ObjectStore + 'static>> {
63+
self.ctx.register_object_store(url, object_store)
64+
}
65+
66+
fn register_table_options_extension_from_scheme(&self, _scheme: &str) {
67+
unimplemented!()
68+
}
69+
70+
async fn execute_logical_plan(&self, plan: LogicalPlan) -> Result<DataFrame, DataFusionError> {
71+
let new_plan = LogicalPlanBuilder::from(plan.clone())
72+
.union(plan.clone())?
73+
.build()?;
74+
75+
self.ctx.execute_logical_plan(new_plan).await
76+
}
77+
}
78+
79+
#[tokio::main]
80+
/// Runs the example.
81+
pub async fn main() {
82+
let my_ctx = MyUnionerContext::default();
83+
84+
let mut print_options = PrintOptions {
85+
format: datafusion_optd_cli::print_format::PrintFormat::Automatic,
86+
quiet: false,
87+
maxrows: datafusion_optd_cli::print_options::MaxRows::Unlimited,
88+
color: true,
89+
};
90+
91+
exec_from_repl(&my_ctx, &mut print_options).await.unwrap();
92+
}

0 commit comments

Comments
 (0)