Skip to content

Commit 1391078

Browse files
authored
feat: upgrade df48 dependency (#1143)
* Upgrade to DF 48 * Update unit test * Resolve clippy warnings * Update wrapper test to look for __repr__ special function * Add __repr__ where missing * Error in return of __repr__ * Remove patch now that DF48 is released * Expose lit_with_metadata and add unit test
1 parent dc0d35a commit 1391078

16 files changed

+325
-145
lines changed

Cargo.lock

Lines changed: 100 additions & 84 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,11 @@ substrait = ["dep:datafusion-substrait"]
3737
tokio = { version = "1.45", features = ["macros", "rt", "rt-multi-thread", "sync"] }
3838
pyo3 = { version = "0.24", features = ["extension-module", "abi3", "abi3-py39"] }
3939
pyo3-async-runtimes = { version = "0.24", features = ["tokio-runtime"]}
40-
arrow = { version = "55.0.0", features = ["pyarrow"] }
41-
datafusion = { version = "47.0.0", features = ["avro", "unicode_expressions"] }
42-
datafusion-substrait = { version = "47.0.0", optional = true }
43-
datafusion-proto = { version = "47.0.0" }
44-
datafusion-ffi = { version = "47.0.0" }
40+
arrow = { version = "55.1.0", features = ["pyarrow"] }
41+
datafusion = { version = "48.0.0", features = ["avro", "unicode_expressions"] }
42+
datafusion-substrait = { version = "48.0.0", optional = true }
43+
datafusion-proto = { version = "48.0.0" }
44+
datafusion-ffi = { version = "48.0.0" }
4545
prost = "0.13.1" # keep in line with `datafusion-substrait`
4646
uuid = { version = "1.16", features = ["v4"] }
4747
mimalloc = { version = "0.1", optional = true, default-features = false, features = ["local_dynamic_tls"] }

python/datafusion/__init__.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@
2121
See https://datafusion.apache.org/python for more information.
2222
"""
2323

24+
from __future__ import annotations
25+
26+
from typing import Any
27+
2428
try:
2529
import importlib.metadata as importlib_metadata
2630
except ImportError:
@@ -130,3 +134,18 @@ def str_lit(value):
130134
def lit(value) -> Expr:
131135
"""Create a literal expression."""
132136
return Expr.literal(value)
137+
138+
139+
def literal_with_metadata(value: Any, metadata: dict[str, str]) -> Expr:
140+
"""Creates a new expression representing a scalar value with metadata.
141+
142+
Args:
143+
value: A valid PyArrow scalar value or easily castable to one.
144+
metadata: Metadata to attach to the expression.
145+
"""
146+
return Expr.literal_with_metadata(value, metadata)
147+
148+
149+
def lit_with_metadata(value: Any, metadata: dict[str, str]) -> Expr:
150+
"""Alias for literal_with_metadata."""
151+
return literal_with_metadata(value, metadata)

python/datafusion/catalog.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ def __init__(self, catalog: df_internal.Catalog) -> None:
3434
"""This constructor is not typically called by the end user."""
3535
self.catalog = catalog
3636

37+
def __repr__(self) -> str:
38+
"""Print a string representation of the catalog."""
39+
return self.catalog.__repr__()
40+
3741
def names(self) -> list[str]:
3842
"""Returns the list of databases in this catalog."""
3943
return self.catalog.names()
@@ -50,6 +54,10 @@ def __init__(self, db: df_internal.Database) -> None:
5054
"""This constructor is not typically called by the end user."""
5155
self.db = db
5256

57+
def __repr__(self) -> str:
58+
"""Print a string representation of the database."""
59+
return self.db.__repr__()
60+
5361
def names(self) -> set[str]:
5462
"""Returns the list of all tables in this database."""
5563
return self.db.names()
@@ -66,6 +74,10 @@ def __init__(self, table: df_internal.Table) -> None:
6674
"""This constructor is not typically called by the end user."""
6775
self.table = table
6876

77+
def __repr__(self) -> str:
78+
"""Print a string representation of the table."""
79+
return self.table.__repr__()
80+
6981
@property
7082
def schema(self) -> pa.Schema:
7183
"""Returns the schema associated with this table."""

python/datafusion/context.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -496,6 +496,10 @@ def __init__(
496496

497497
self.ctx = SessionContextInternal(config, runtime)
498498

499+
def __repr__(self) -> str:
500+
"""Print a string representation of the Session Context."""
501+
return self.ctx.__repr__()
502+
499503
@classmethod
500504
def global_ctx(cls) -> SessionContext:
501505
"""Retrieve the global context as a `SessionContext` wrapper.

python/datafusion/expr.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,20 @@ def literal(value: Any) -> Expr:
435435
value = pa.scalar(value)
436436
return Expr(expr_internal.RawExpr.literal(value))
437437

438+
@staticmethod
439+
def literal_with_metadata(value: Any, metadata: dict[str, str]) -> Expr:
440+
"""Creates a new expression representing a scalar value with metadata.
441+
442+
Args:
443+
value: A valid PyArrow scalar value or easily castable to one.
444+
metadata: Metadata to attach to the expression.
445+
"""
446+
if isinstance(value, str):
447+
value = pa.scalar(value, type=pa.string_view())
448+
value = value if isinstance(value, pa.Scalar) else pa.scalar(value)
449+
450+
return Expr(expr_internal.RawExpr.literal_with_metadata(value, metadata))
451+
438452
@staticmethod
439453
def string_literal(value: str) -> Expr:
440454
"""Creates a new expression representing a UTF8 literal value.
@@ -1172,6 +1186,10 @@ def __init__(
11721186
end_bound = end_bound.cast(pa.uint64())
11731187
self.window_frame = expr_internal.WindowFrame(units, start_bound, end_bound)
11741188

1189+
def __repr__(self) -> str:
1190+
"""Print a string representation of the window frame."""
1191+
return self.window_frame.__repr__()
1192+
11751193
def get_frame_units(self) -> str:
11761194
"""Returns the window frame units for the bounds."""
11771195
return self.window_frame.get_frame_units()

python/datafusion/user_defined.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,10 @@ def __init__(
102102
name, func, input_types, return_type, str(volatility)
103103
)
104104

105+
def __repr__(self) -> str:
106+
"""Print a string representation of the Scalar UDF."""
107+
return self._udf.__repr__()
108+
105109
def __call__(self, *args: Expr) -> Expr:
106110
"""Execute the UDF.
107111
@@ -268,6 +272,10 @@ def __init__(
268272
str(volatility),
269273
)
270274

275+
def __repr__(self) -> str:
276+
"""Print a string representation of the Aggregate UDF."""
277+
return self._udaf.__repr__()
278+
271279
def __call__(self, *args: Expr) -> Expr:
272280
"""Execute the UDAF.
273281
@@ -604,6 +612,10 @@ def __init__(
604612
name, func, input_types, return_type, str(volatility)
605613
)
606614

615+
def __repr__(self) -> str:
616+
"""Print a string representation of the Window UDF."""
617+
return self._udwf.__repr__()
618+
607619
def __call__(self, *args: Expr) -> Expr:
608620
"""Execute the UDWF.
609621

python/tests/test_expr.py

Lines changed: 58 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,14 @@
1919

2020
import pyarrow as pa
2121
import pytest
22-
from datafusion import SessionContext, col, functions, lit
22+
from datafusion import (
23+
SessionContext,
24+
col,
25+
functions,
26+
lit,
27+
lit_with_metadata,
28+
literal_with_metadata,
29+
)
2330
from datafusion.expr import (
2431
Aggregate,
2532
AggregateFunction,
@@ -103,7 +110,7 @@ def test_limit(test_ctx):
103110

104111
plan = plan.to_variant()
105112
assert isinstance(plan, Limit)
106-
assert "Skip: Some(Literal(Int64(5)))" in str(plan)
113+
assert "Skip: Some(Literal(Int64(5), None))" in str(plan)
107114

108115

109116
def test_aggregate_query(test_ctx):
@@ -824,3 +831,52 @@ def test_expr_functions(ctx, function, expected_result):
824831

825832
assert len(result) == 1
826833
assert result[0].column(0).equals(expected_result)
834+
835+
836+
def test_literal_metadata(ctx):
837+
result = (
838+
ctx.from_pydict({"a": [1]})
839+
.select(
840+
lit(1).alias("no_metadata"),
841+
lit_with_metadata(2, {"key1": "value1"}).alias("lit_with_metadata_fn"),
842+
literal_with_metadata(3, {"key2": "value2"}).alias(
843+
"literal_with_metadata_fn"
844+
),
845+
)
846+
.collect()
847+
)
848+
849+
expected_schema = pa.schema(
850+
[
851+
pa.field("no_metadata", pa.int64(), nullable=False),
852+
pa.field(
853+
"lit_with_metadata_fn",
854+
pa.int64(),
855+
nullable=False,
856+
metadata={"key1": "value1"},
857+
),
858+
pa.field(
859+
"literal_with_metadata_fn",
860+
pa.int64(),
861+
nullable=False,
862+
metadata={"key2": "value2"},
863+
),
864+
]
865+
)
866+
867+
expected = pa.RecordBatch.from_pydict(
868+
{
869+
"no_metadata": pa.array([1]),
870+
"lit_with_metadata_fn": pa.array([2]),
871+
"literal_with_metadata_fn": pa.array([3]),
872+
},
873+
schema=expected_schema,
874+
)
875+
876+
assert result[0] == expected
877+
878+
# Testing result[0].schema == expected_schema does not check each key/value pair
879+
# so we want to explicitly test these
880+
for expected_field in expected_schema:
881+
actual_field = result[0].schema.field(expected_field.name)
882+
assert expected_field.metadata == actual_field.metadata

python/tests/test_wrapper_coverage.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,14 @@
2828
from enum import EnumMeta as EnumType
2929

3030

31-
def missing_exports(internal_obj, wrapped_obj) -> None:
31+
def missing_exports(internal_obj, wrapped_obj) -> None: # noqa: C901
3232
"""
3333
Identify if any of the rust exposted structs or functions do not have wrappers.
3434
3535
Special handling for:
3636
- Raw* classes: Internal implementation details that shouldn't be exposed
3737
- _global_ctx: Internal implementation detail
38-
- __self__, __class__: Python special attributes
38+
- __self__, __class__, __repr__: Python special attributes
3939
"""
4040
# Special case enums - EnumType overrides a some of the internal functions,
4141
# so check all of the values exist and move on
@@ -45,6 +45,9 @@ def missing_exports(internal_obj, wrapped_obj) -> None:
4545
assert value in dir(wrapped_obj)
4646
return
4747

48+
if "__repr__" in internal_obj.__dict__ and "__repr__" not in wrapped_obj.__dict__:
49+
pytest.fail(f"Missing __repr__: {internal_obj.__name__}")
50+
4851
for internal_attr_name in dir(internal_obj):
4952
wrapped_attr_name = internal_attr_name.removeprefix("Raw")
5053
assert wrapped_attr_name in dir(wrapped_obj)

src/context.rs

Lines changed: 37 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ use datafusion::datasource::TableProvider;
6161
use datafusion::execution::context::{
6262
DataFilePaths, SQLOptions, SessionConfig, SessionContext, TaskContext,
6363
};
64-
use datafusion::execution::disk_manager::DiskManagerConfig;
64+
use datafusion::execution::disk_manager::DiskManagerMode;
6565
use datafusion::execution::memory_pool::{FairSpillPool, GreedyMemoryPool, UnboundedMemoryPool};
6666
use datafusion::execution::options::ReadOptions;
6767
use datafusion::execution::runtime_env::RuntimeEnvBuilder;
@@ -183,22 +183,49 @@ impl PyRuntimeEnvBuilder {
183183
}
184184

185185
fn with_disk_manager_disabled(&self) -> Self {
186-
let mut builder = self.builder.clone();
187-
builder = builder.with_disk_manager(DiskManagerConfig::Disabled);
188-
Self { builder }
186+
let mut runtime_builder = self.builder.clone();
187+
188+
let mut disk_mgr_builder = runtime_builder
189+
.disk_manager_builder
190+
.clone()
191+
.unwrap_or_default();
192+
disk_mgr_builder.set_mode(DiskManagerMode::Disabled);
193+
194+
runtime_builder = runtime_builder.with_disk_manager_builder(disk_mgr_builder);
195+
Self {
196+
builder: runtime_builder,
197+
}
189198
}
190199

191200
fn with_disk_manager_os(&self) -> Self {
192-
let builder = self.builder.clone();
193-
let builder = builder.with_disk_manager(DiskManagerConfig::NewOs);
194-
Self { builder }
201+
let mut runtime_builder = self.builder.clone();
202+
203+
let mut disk_mgr_builder = runtime_builder
204+
.disk_manager_builder
205+
.clone()
206+
.unwrap_or_default();
207+
disk_mgr_builder.set_mode(DiskManagerMode::OsTmpDirectory);
208+
209+
runtime_builder = runtime_builder.with_disk_manager_builder(disk_mgr_builder);
210+
Self {
211+
builder: runtime_builder,
212+
}
195213
}
196214

197215
fn with_disk_manager_specified(&self, paths: Vec<String>) -> Self {
198-
let builder = self.builder.clone();
199216
let paths = paths.iter().map(|s| s.into()).collect();
200-
let builder = builder.with_disk_manager(DiskManagerConfig::NewSpecified(paths));
201-
Self { builder }
217+
let mut runtime_builder = self.builder.clone();
218+
219+
let mut disk_mgr_builder = runtime_builder
220+
.disk_manager_builder
221+
.clone()
222+
.unwrap_or_default();
223+
disk_mgr_builder.set_mode(DiskManagerMode::Directories(paths));
224+
225+
runtime_builder = runtime_builder.with_disk_manager_builder(disk_mgr_builder);
226+
Self {
227+
builder: runtime_builder,
228+
}
202229
}
203230

204231
fn with_unbounded_memory_pool(&self) -> Self {

0 commit comments

Comments
 (0)