Skip to content

Commit 0cb4bec

Browse files
committed
fix: improvements
1 parent 5c6a259 commit 0cb4bec

File tree

5 files changed

+58
-50
lines changed

5 files changed

+58
-50
lines changed

README.md

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ This module provides predicate pushdown capabilities for file-based data storage
1212
- **Metadata-Based Filtering**: Support for both range-based (min/max) and set-based field metadata
1313
- **Null Handling**: Comprehensive support for `NULL` value semantics in SQL expressions
1414
- **Complex Predicates**: Handle `AND`, `OR`, `XOR`, `NOT`, `IN`, `BETWEEN`, `CASE` statements, and more
15-
- **Multiple Data Types**: Support for integers, floats, strings, decimals, and `NULL` values
15+
- **Multiple Data Types**: Support for integers, floats, strings, decimals, and `NULL` values. Support for casting between Arrow scalar types.
1616
- **Dialect Support**: Configurable SQL dialect support (default: DuckDB)
1717

1818
## Installation
@@ -67,11 +67,11 @@ files = [
6767
planner = Planner(files)
6868

6969
# Filter files based on SQL expressions
70-
matching_files = set(planner.get_matching_files("sales_amount > 40000 AND region = 'US'"))
70+
matching_files = set(planner.files("sales_amount > 40000 AND region = 'US'"))
7171
print(matching_files) # {'data_2023_q1.parquet', 'data_2023_q2.parquet'}
7272

7373
# More complex queries
74-
matching_files = set(planner.get_matching_files("region IN ('EU', 'UK')"))
74+
matching_files = set(planner.files("region IN ('EU', 'UK')"))
7575
print(matching_files) # {'data_2023_q2.parquet'}
7676
```
7777

@@ -123,6 +123,9 @@ SetFieldInfo(
123123
### Control Flow
124124
- `CASE WHEN ... THEN ... ELSE ... END` (conditional expressions)
125125

126+
### Data Types
127+
- `CAST` (type casting)
128+
126129
### Literals
127130
- Numeric literals: `123`, `45.67`
128131
- String literals: `'hello'`
@@ -134,41 +137,41 @@ SetFieldInfo(
134137
### Range Queries
135138
```python
136139
# Files with sales between 1000 and 5000
137-
planner.get_matching_files("sales_amount BETWEEN 1000 AND 5000")
140+
planner.files("sales_amount BETWEEN 1000 AND 5000")
138141

139142
# Files with any sales over 10000
140-
planner.get_matching_files("sales_amount > 10000")
143+
planner.files("sales_amount > 10000")
141144
```
142145

143146
### Set Membership
144147
```python
145148
# Files containing specific regions
146-
planner.get_matching_files("region IN ('US', 'CA')")
149+
planner.files("region IN ('US', 'CA')")
147150

148151
# Files not containing specific regions
149-
planner.get_matching_files("region NOT IN ('UNKNOWN', 'TEST')")
152+
planner.files("region NOT IN ('UNKNOWN', 'TEST')")
150153
```
151154

152155
### Complex Conditions
153156
```python
154157
# Combination of range and set conditions
155-
planner.get_matching_files(
158+
planner.files(
156159
"sales_amount > 5000 AND region IN ('US', 'EU') AND customer_id IS NOT NULL"
157160
)
158161

159162
# Case expressions
160-
planner.get_matching_files(
163+
planner.files(
161164
"CASE WHEN region = 'US' THEN sales_amount > 1000 ELSE sales_amount > 500 END"
162165
)
163166
```
164167

165168
### Null Handling
166169
```python
167170
# Files that might contain null values in sales_amount
168-
planner.get_matching_files("sales_amount IS NULL")
171+
planner.files("sales_amount IS NULL")
169172

170173
# Files with non-null sales amounts over 1000
171-
planner.get_matching_files("sales_amount IS NOT NULL AND sales_amount > 1000")
174+
planner.files("sales_amount IS NOT NULL AND sales_amount > 1000")
172175
```
173176

174177
## Performance Considerations

pyproject.toml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
[project]
22
name = "query-farm-sql-scan-planning"
3-
version = "0.1.5"
3+
version = "0.1.6"
44
description = "A Python library for intelligent file filtering using SQL expressions and metadata-based scan planning. This library enables efficient data lake query optimization by determining which files need to be scanned based on their statistical metadata."
55
authors = [
66
{ name = "Rusty Conover", email = "[email protected]" }
77
]
88
dependencies = [
99
"sqlglot>=26.33.0",
10-
"duckdb>=1.3.2",
11-
"pyarrow>=20.0.0",
10+
"duckdb>=1.0.0",
11+
"pyarrow>=18.0.0",
1212
]
1313
readme = "README.md"
1414
requires-python = ">= 3.11"
@@ -39,6 +39,7 @@ dev-dependencies = [
3939
"pytest-env>=1.1.3",
4040
"pytest-cov>=5.0.0",
4141
"ruff>=0.6.2",
42+
"pdoc>=15.0.4",
4243
]
4344

4445
[tool.hatch.metadata]

requirements-dev.lock

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@ filelock==3.18.0
1818
# via pytest-mypy
1919
iniconfig==2.1.0
2020
# via pytest
21+
jinja2==3.1.6
22+
# via pdoc
23+
markupsafe==3.0.2
24+
# via jinja2
25+
# via pdoc
2126
mypy==1.16.1
2227
# via pytest-mypy
2328
mypy-extensions==1.1.0
@@ -26,12 +31,14 @@ packaging==25.0
2631
# via pytest
2732
pathspec==0.12.1
2833
# via mypy
34+
pdoc==15.0.4
2935
pluggy==1.6.0
3036
# via pytest
3137
# via pytest-cov
3238
pyarrow==20.0.0
3339
# via query-farm-sql-scan-planning
3440
pygments==2.19.2
41+
# via pdoc
3542
# via pytest
3643
pytest==8.4.1
3744
# via pytest-cov

src/query_farm_sql_scan_planning/planner.py

Lines changed: 32 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from collections.abc import Callable
22
from dataclasses import dataclass
33
from typing import Any, Generator
4+
from collections.abc import Container
45
import duckdb
56
import pyarrow as pa
67
import pyarrow.compute as pc
@@ -16,33 +17,36 @@ class BaseFieldInfo:
1617
"""
1718

1819
has_nulls: bool
20+
"""Whether the field has null values in this file."""
21+
1922
has_non_nulls: bool
23+
"""Whether the field has non-null values in this file."""
2024

2125

2226
@dataclass
2327
class RangeFieldInfo(BaseFieldInfo):
2428
"""
2529
Information about a field that has a min and max value.
30+
This is used for range-based filtering in scan planning.
2631
"""
2732

28-
min_value: pa.Scalar
29-
max_value: pa.Scalar
33+
min_value: pa.Scalar | None
34+
"""Minimum value in the field, can be None if the field is empty."""
35+
36+
max_value: pa.Scalar | None
37+
"""Maximum value in the field, can be None if the field is empty."""
3038

3139

3240
@dataclass
3341
class SetFieldInfo(BaseFieldInfo):
3442
"""
3543
Information about a field where the set of values are known.
36-
The information about what values that are contained can produce
37-
false positives.
3844
"""
3945

40-
values: set[
46+
values: Container[
4147
pa.Scalar
4248
] # Set of values that are known to be present in the field, false positives are okay.
43-
44-
45-
AnyFieldInfo = SetFieldInfo | RangeFieldInfo
49+
"""A container of values that are known to be present in the field in this file."""
4650

4751

4852
def _scalar_value_op(
@@ -101,9 +105,7 @@ def _sv_eq(a: pa.Scalar, b: pa.Scalar) -> bool:
101105
return _scalar_value_op(a, b, lambda x, y: x == y)
102106

103107

104-
FileFieldInfo = dict[str, AnyFieldInfo]
105-
106-
# When bailing out we should know why we bailed out if we couldn't evaluate the expression.
108+
FileFieldInfo = dict[str, SetFieldInfo | RangeFieldInfo]
107109

108110

109111
class Planner:
@@ -113,13 +115,12 @@ class Planner:
113115

114116
def __init__(self, files: list[tuple[str, FileFieldInfo]]):
115117
"""
116-
Initialize with list of (filename, min_value, max_value) tuples.
117-
118-
Args:
119-
file_ranges: List of tuples containing (filename, min_val, max_val)
118+
Initialize with a list of (filename, FileFieldInfo) tuples.
120119
"""
121-
self.files = files
122-
self.connection = duckdb.connect(":memory:")
120+
self._files = files
121+
"""The list of files with their field information."""
122+
self._connection = duckdb.connect(":memory:")
123+
"""DuckDB connection for evaluating scalar values."""
123124

124125
def _eval_predicate(
125126
self,
@@ -162,7 +163,7 @@ def _eval_predicate(
162163

163164
# The thing on the right side should be something that can be evaluated against a range.
164165
# ideally, it's going to be a
165-
value_result = self.connection.execute(
166+
value_result = self._connection.execute(
166167
f"select {node.right.sql('duckdb')}"
167168
).arrow()
168169
assert value_result.num_rows == 1, (
@@ -497,31 +498,29 @@ def _evaluate_sql_node(
497498
f"Supported types: Connector, Predicate, Not, Boolean, Case, Null"
498499
)
499500

500-
def get_matching_files(
501-
self, exp: sqlglot.expressions.Expression | str, *, dialect: str = "duckdb"
501+
def files(
502+
self,
503+
expression: sqlglot.expressions.Expression | str,
504+
*,
505+
dialect: str = "duckdb",
502506
) -> Generator[str, None, None]:
503507
"""
504508
Get a set of files that match the given SQL expression.
505-
Args:
506-
expression: The SQL expression to evaluate.
507-
dialect: The SQL dialect to use for parsing the expression.
508-
Returns:
509-
A set of filenames that match the expression.
510509
"""
511-
if isinstance(exp, str):
510+
if isinstance(expression, str):
512511
# Parse the expression if it is a string.
513-
expression = sqlglot.parse_one(exp, dialect=dialect)
512+
exp = sqlglot.parse_one(expression, dialect=dialect)
514513
else:
515-
expression = exp
514+
exp = expression
516515

517-
if not isinstance(expression, sqlglot.expressions.Expression):
518-
raise ValueError(f"Expected a sqlglot expression, got {type(expression)}")
516+
if not isinstance(exp, sqlglot.expressions.Expression):
517+
raise ValueError(f"Expected a sqlglot expression, got {type(exp)}")
519518

520519
# Simplify the parsed expression, move all of the literals to the right side
521-
expression = sqlglot.optimizer.optimize(expression)
520+
exp = sqlglot.optimizer.optimize(exp)
522521

523-
for filename, file_info in self.files:
524-
eval_result = self._evaluate_sql_node(expression, file_info)
522+
for filename, file_info in self._files:
523+
eval_result = self._evaluate_sql_node(exp, file_info)
525524
if eval_result is None or eval_result is True:
526525
# If the expression evaluates to True or cannot be evaluated, add the file
527526
# to the result set since the caller will be able to filter the rows further.

src/query_farm_sql_scan_planning/test_planner.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -314,9 +314,7 @@ def test_scan_planning(
314314
filter_obj = Planner(sample_files)
315315

316316
# Apply the filter
317-
result = set(
318-
filter_obj.get_matching_files(sqlglot.parse_one(clause, dialect="duckdb"))
319-
)
317+
result = set(filter_obj.files(sqlglot.parse_one(clause, dialect="duckdb")))
320318

321319
# Check if files were filtered as expected
322320
if result != expected_files:

0 commit comments

Comments
 (0)