Skip to content

Commit 0cb4bec

Browse files
committed
fix: improvements
1 parent 5c6a259 commit 0cb4bec

File tree

5 files changed

+58
-50
lines changed

5 files changed

+58
-50
lines changed

README.md

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ This module provides predicate pushdown capabilities for file-based data storage
1212
- **Metadata-Based Filtering**: Support for both range-based (min/max) and set-based field metadata
1313
- **Null Handling**: Comprehensive support for `NULL` value semantics in SQL expressions
1414
- **Complex Predicates**: Handle `AND`, `OR`, `XOR`, `NOT`, `IN`, `BETWEEN`, `CASE` statements, and more
15-
- **Multiple Data Types**: Support for integers, floats, strings, decimals, and `NULL` values
15+
- **Multiple Data Types**: Support for integers, floats, strings, decimals, and `NULL` values. Support for casting between Arrow scalar types.
1616
- **Dialect Support**: Configurable SQL dialect support (default: DuckDB)
1717

1818
## Installation
@@ -67,11 +67,11 @@ files = [
6767
planner = Planner(files)
6868

6969
# Filter files based on SQL expressions
70-
matching_files = set(planner.get_matching_files("sales_amount > 40000 AND region = 'US'"))
70+
matching_files = set(planner.files("sales_amount > 40000 AND region = 'US'"))
7171
print(matching_files) # {'data_2023_q1.parquet', 'data_2023_q2.parquet'}
7272

7373
# More complex queries
74-
matching_files = set(planner.get_matching_files("region IN ('EU', 'UK')"))
74+
matching_files = set(planner.files("region IN ('EU', 'UK')"))
7575
print(matching_files) # {'data_2023_q2.parquet'}
7676
```
7777

@@ -123,6 +123,9 @@ SetFieldInfo(
123123
### Control Flow
124124
- `CASE WHEN ... THEN ... ELSE ... END` (conditional expressions)
125125

126+
### Data Types
127+
- `CAST` (type casting)
128+
126129
### Literals
127130
- Numeric literals: `123`, `45.67`
128131
- String literals: `'hello'`
@@ -134,41 +137,41 @@ SetFieldInfo(
134137
### Range Queries
135138
```python
136139
# Files with sales between 1000 and 5000
137-
planner.get_matching_files("sales_amount BETWEEN 1000 AND 5000")
140+
planner.files("sales_amount BETWEEN 1000 AND 5000")
138141

139142
# Files with any sales over 10000
140-
planner.get_matching_files("sales_amount > 10000")
143+
planner.files("sales_amount > 10000")
141144
```
142145

143146
### Set Membership
144147
```python
145148
# Files containing specific regions
146-
planner.get_matching_files("region IN ('US', 'CA')")
149+
planner.files("region IN ('US', 'CA')")
147150

148151
# Files not containing specific regions
149-
planner.get_matching_files("region NOT IN ('UNKNOWN', 'TEST')")
152+
planner.files("region NOT IN ('UNKNOWN', 'TEST')")
150153
```
151154

152155
### Complex Conditions
153156
```python
154157
# Combination of range and set conditions
155-
planner.get_matching_files(
158+
planner.files(
156159
"sales_amount > 5000 AND region IN ('US', 'EU') AND customer_id IS NOT NULL"
157160
)
158161

159162
# Case expressions
160-
planner.get_matching_files(
163+
planner.files(
161164
"CASE WHEN region = 'US' THEN sales_amount > 1000 ELSE sales_amount > 500 END"
162165
)
163166
```
164167

165168
### Null Handling
166169
```python
167170
# Files that might contain null values in sales_amount
168-
planner.get_matching_files("sales_amount IS NULL")
171+
planner.files("sales_amount IS NULL")
169172

170173
# Files with non-null sales amounts over 1000
171-
planner.get_matching_files("sales_amount IS NOT NULL AND sales_amount > 1000")
174+
planner.files("sales_amount IS NOT NULL AND sales_amount > 1000")
172175
```
173176

174177
## Performance Considerations

pyproject.toml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
[project]
22
name = "query-farm-sql-scan-planning"
3-
version = "0.1.5"
3+
version = "0.1.6"
44
description = "A Python library for intelligent file filtering using SQL expressions and metadata-based scan planning. This library enables efficient data lake query optimization by determining which files need to be scanned based on their statistical metadata."
55
authors = [
66
{ name = "Rusty Conover", email = "[email protected]" }
77
]
88
dependencies = [
99
"sqlglot>=26.33.0",
10-
"duckdb>=1.3.2",
11-
"pyarrow>=20.0.0",
10+
"duckdb>=1.0.0",
11+
"pyarrow>=18.0.0",
1212
]
1313
readme = "README.md"
1414
requires-python = ">= 3.11"
@@ -39,6 +39,7 @@ dev-dependencies = [
3939
"pytest-env>=1.1.3",
4040
"pytest-cov>=5.0.0",
4141
"ruff>=0.6.2",
42+
"pdoc>=15.0.4",
4243
]
4344

4445
[tool.hatch.metadata]

requirements-dev.lock

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@ filelock==3.18.0
1818
# via pytest-mypy
1919
iniconfig==2.1.0
2020
# via pytest
21+
jinja2==3.1.6
22+
# via pdoc
23+
markupsafe==3.0.2
24+
# via jinja2
25+
# via pdoc
2126
mypy==1.16.1
2227
# via pytest-mypy
2328
mypy-extensions==1.1.0
@@ -26,12 +31,14 @@ packaging==25.0
2631
# via pytest
2732
pathspec==0.12.1
2833
# via mypy
34+
pdoc==15.0.4
2935
pluggy==1.6.0
3036
# via pytest
3137
# via pytest-cov
3238
pyarrow==20.0.0
3339
# via query-farm-sql-scan-planning
3440
pygments==2.19.2
41+
# via pdoc
3542
# via pytest
3643
pytest==8.4.1
3744
# via pytest-cov

src/query_farm_sql_scan_planning/planner.py

Lines changed: 32 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from collections.abc import Callable
22
from dataclasses import dataclass
33
from typing import Any, Generator
4+
from collections.abc import Container
45
import duckdb
56
import pyarrow as pa
67
import pyarrow.compute as pc
@@ -16,33 +17,36 @@ class BaseFieldInfo:
1617
"""
1718

1819
has_nulls: bool
20+
"""Whether the field has null values in this file."""
21+
1922
has_non_nulls: bool
23+
"""Whether the field has non-null values in this file."""
2024

2125

2226
@dataclass
2327
class RangeFieldInfo(BaseFieldInfo):
2428
"""
2529
Information about a field that has a min and max value.
30+
This is used for range-based filtering in scan planning.
2631
"""
2732

28-
min_value: pa.Scalar
29-
max_value: pa.Scalar
33+
min_value: pa.Scalar | None
34+
"""Minimum value in the field, can be None if the field is empty."""
35+
36+
max_value: pa.Scalar | None
37+
"""Maximum value in the field, can be None if the field is empty."""
3038

3139

3240
@dataclass
3341
class SetFieldInfo(BaseFieldInfo):
3442
"""
3543
Information about a field where the set of values are known.
36-
The information about what values that are contained can produce
37-
false positives.
3844
"""
3945

40-
values: set[
46+
values: Container[
4147
pa.Scalar
4248
] # Set of values that are known to be present in the field, false positives are okay.
43-
44-
45-
AnyFieldInfo = SetFieldInfo | RangeFieldInfo
49+
"""A container of values that are known to be present in the field in this file."""
4650

4751

4852
def _scalar_value_op(
@@ -101,9 +105,7 @@ def _sv_eq(a: pa.Scalar, b: pa.Scalar) -> bool:
101105
return _scalar_value_op(a, b, lambda x, y: x == y)
102106

103107

104-
FileFieldInfo = dict[str, AnyFieldInfo]
105-
106-
# When bailing out we should know why we bailed out if we couldn't evaluate the expression.
108+
FileFieldInfo = dict[str, SetFieldInfo | RangeFieldInfo]
107109

108110

109111
class Planner:
@@ -113,13 +115,12 @@ class Planner:
113115

114116
def __init__(self, files: list[tuple[str, FileFieldInfo]]):
115117
"""
116-
Initialize with list of (filename, min_value, max_value) tuples.
117-
118-
Args:
119-
file_ranges: List of tuples containing (filename, min_val, max_val)
118+
Initialize with a list of (filename, FileFieldInfo) tuples.
120119
"""
121-
self.files = files
122-
self.connection = duckdb.connect(":memory:")
120+
self._files = files
121+
"""The list of files with their field information."""
122+
self._connection = duckdb.connect(":memory:")
123+
"""DuckDB connection for evaluating scalar values."""
123124

124125
def _eval_predicate(
125126
self,
@@ -162,7 +163,7 @@ def _eval_predicate(
162163

163164
# The thing on the right side should be something that can be evaluated against a range.
164165
# ideally, it's going to be a
165-
value_result = self.connection.execute(
166+
value_result = self._connection.execute(
166167
f"select {node.right.sql('duckdb')}"
167168
).arrow()
168169
assert value_result.num_rows == 1, (
@@ -497,31 +498,29 @@ def _evaluate_sql_node(
497498
f"Supported types: Connector, Predicate, Not, Boolean, Case, Null"
498499
)
499500

500-
def get_matching_files(
501-
self, exp: sqlglot.expressions.Expression | str, *, dialect: str = "duckdb"
501+
def files(
502+
self,
503+
expression: sqlglot.expressions.Expression | str,
504+
*,
505+
dialect: str = "duckdb",
502506
) -> Generator[str, None, None]:
503507
"""
504508
Get a set of files that match the given SQL expression.
505-
Args:
506-
expression: The SQL expression to evaluate.
507-
dialect: The SQL dialect to use for parsing the expression.
508-
Returns:
509-
A set of filenames that match the expression.
510509
"""
511-
if isinstance(exp, str):
510+
if isinstance(expression, str):
512511
# Parse the expression if it is a string.
513-
expression = sqlglot.parse_one(exp, dialect=dialect)
512+
exp = sqlglot.parse_one(expression, dialect=dialect)
514513
else:
515-
expression = exp
514+
exp = expression
516515

517-
if not isinstance(expression, sqlglot.expressions.Expression):
518-
raise ValueError(f"Expected a sqlglot expression, got {type(expression)}")
516+
if not isinstance(exp, sqlglot.expressions.Expression):
517+
raise ValueError(f"Expected a sqlglot expression, got {type(exp)}")
519518

520519
# Simplify the parsed expression, move all of the literals to the right side
521-
expression = sqlglot.optimizer.optimize(expression)
520+
exp = sqlglot.optimizer.optimize(exp)
522521

523-
for filename, file_info in self.files:
524-
eval_result = self._evaluate_sql_node(expression, file_info)
522+
for filename, file_info in self._files:
523+
eval_result = self._evaluate_sql_node(exp, file_info)
525524
if eval_result is None or eval_result is True:
526525
# If the expression evaluates to True or cannot be evaluated, add the file
527526
# to the result set since the caller will be able to filter the rows further.

src/query_farm_sql_scan_planning/test_planner.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -314,9 +314,7 @@ def test_scan_planning(
314314
filter_obj = Planner(sample_files)
315315

316316
# Apply the filter
317-
result = set(
318-
filter_obj.get_matching_files(sqlglot.parse_one(clause, dialect="duckdb"))
319-
)
317+
result = set(filter_obj.files(sqlglot.parse_one(clause, dialect="duckdb")))
320318

321319
# Check if files were filtered as expected
322320
if result != expected_files:

0 commit comments

Comments
 (0)