1
1
from collections.abc import Callable
from collections.abc import Container
from dataclasses import dataclass
from typing import Any, Generator

import duckdb
import pyarrow as pa
import pyarrow.compute as pc
@@ -16,33 +17,36 @@ class BaseFieldInfo:
16
17
"""
17
18
18
19
has_nulls : bool
20
+ """Whether the field has null values in this file."""
21
+
19
22
has_non_nulls : bool
23
+ """Whether the field has non-null values in this file."""
20
24
21
25
22
26
@dataclass
class RangeFieldInfo(BaseFieldInfo):
    """
    Information about a field that has a min and max value.

    This is used for range-based filtering in scan planning.
    """

    min_value: pa.Scalar | None
    """Minimum value in the field, can be None if the field is empty."""

    max_value: pa.Scalar | None
    """Maximum value in the field, can be None if the field is empty."""
30
38
31
39
32
40
@dataclass
class SetFieldInfo(BaseFieldInfo):
    """
    Information about a field where the set of values are known.
    """

    values: Container[pa.Scalar]
    """
    A container of values that are known to be present in the field in this
    file. False positives are okay.
    """


# Retained so that existing importers of this name keep working.
AnyFieldInfo = SetFieldInfo | RangeFieldInfo
46
50
47
51
48
52
def _scalar_value_op (
@@ -101,9 +105,7 @@ def _sv_eq(a: pa.Scalar, b: pa.Scalar) -> bool:
101
105
return _scalar_value_op (a , b , lambda x , y : x == y )
102
106
103
107
104
# Field information for a single file: a mapping from a string key
# (presumably the field name -- confirm against callers) to what is known
# about that field's values.
FileFieldInfo = dict[str, SetFieldInfo | RangeFieldInfo]
107
109
108
110
109
111
class Planner :
@@ -113,13 +115,12 @@ class Planner:
113
115
114
116
def __init__(self, files: list[tuple[str, FileFieldInfo]]):
    """
    Initialize with a list of (filename, FileFieldInfo) tuples.
    """
    self._files = files
    """The list of files with their field information."""
    self._connection = duckdb.connect(":memory:")
    """DuckDB connection for evaluating scalar values."""
123
124
124
125
def _eval_predicate (
125
126
self ,
@@ -162,7 +163,7 @@ def _eval_predicate(
162
163
163
164
# The thing on the right side should be something that can be evaluated against a range.
164
165
# ideally, its going to be a
165
- value_result = self .connection .execute (
166
+ value_result = self ._connection .execute (
166
167
f"select { node .right .sql ('duckdb' )} "
167
168
).arrow ()
168
169
assert value_result .num_rows == 1 , (
@@ -497,31 +498,29 @@ def _evaluate_sql_node(
497
498
f"Supported types: Connector, Predicate, Not, Boolean, Case, Null"
498
499
)
499
500
500
- def get_matching_files (
501
- self , exp : sqlglot .expressions .Expression | str , * , dialect : str = "duckdb"
501
+ def files (
502
+ self ,
503
+ expression : sqlglot .expressions .Expression | str ,
504
+ * ,
505
+ dialect : str = "duckdb" ,
502
506
) -> Generator [str , None , None ]:
503
507
"""
504
508
Get a set of files that match the given SQL expression.
505
- Args:
506
- expression: The SQL expression to evaluate.
507
- dialect: The SQL dialect to use for parsing the expression.
508
- Returns:
509
- A set of filenames that match the expression.
510
509
"""
511
- if isinstance (exp , str ):
510
+ if isinstance (expression , str ):
512
511
# Parse the expression if it is a string.
513
- expression = sqlglot .parse_one (exp , dialect = dialect )
512
+ exp = sqlglot .parse_one (expression , dialect = dialect )
514
513
else :
515
- expression = exp
514
+ exp = expression
516
515
517
- if not isinstance (expression , sqlglot .expressions .Expression ):
518
- raise ValueError (f"Expected a sqlglot expression, got { type (expression )} " )
516
+ if not isinstance (exp , sqlglot .expressions .Expression ):
517
+ raise ValueError (f"Expected a sqlglot expression, got { type (exp )} " )
519
518
520
519
# Simplify the parsed expression, move all of the literals to the right side
521
- expression = sqlglot .optimizer .optimize (expression )
520
+ exp = sqlglot .optimizer .optimize (exp )
522
521
523
- for filename , file_info in self .files :
524
- eval_result = self ._evaluate_sql_node (expression , file_info )
522
+ for filename , file_info in self ._files :
523
+ eval_result = self ._evaluate_sql_node (exp , file_info )
525
524
if eval_result is None or eval_result is True :
526
525
# If the expression evaluates to True or cannot be evaluated, add the file
527
526
# to the result set since the caller will be able to filter the rows further.
0 commit comments