fix: change to arrow scalar types to accommodate more data types

rustyconover · rustyconover · commit bae5b35b16fd · 2025-07-10T12:26:14.000-04:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -7,6 +7,8 @@ authors = [
 ]
 dependencies = [
     "sqlglot>=26.33.0",
+    "duckdb>=1.3.2",
+    "pyarrow>=20.0.0",
 ]
 readme = "README.md"
 requires-python = ">= 3.11"
diff --git a/requirements-dev.lock b/requirements-dev.lock
@@ -12,6 +12,8 @@
 -e file:.
 coverage==7.9.2
     # via pytest-cov
+duckdb==1.3.2
+    # via query-farm-sql-scan-planning
 filelock==3.18.0
     # via pytest-mypy
 iniconfig==2.1.0
@@ -27,6 +29,8 @@ pathspec==0.12.1
 pluggy==1.6.0
     # via pytest
     # via pytest-cov
+pyarrow==20.0.0
+    # via query-farm-sql-scan-planning
 pygments==2.19.2
     # via pytest
 pytest==8.4.1
@@ -37,7 +41,7 @@ pytest-cov==6.2.1
 pytest-env==1.1.5
 pytest-mypy==1.0.1
 ruff==0.12.2
-sqlglot==26.33.0
+sqlglot==27.0.0
     # via query-farm-sql-scan-planning
 typing-extensions==4.14.1
     # via mypy
diff --git a/requirements.lock b/requirements.lock
@@ -10,5 +10,9 @@
 #   universal: false
 
 -e file:.
-sqlglot==26.33.0
+duckdb==1.3.2
+    # via query-farm-sql-scan-planning
+pyarrow==20.0.0
+    # via query-farm-sql-scan-planning
+sqlglot==27.0.0
     # via query-farm-sql-scan-planning
diff --git a/src/query_farm_sql_scan_planning/planner.py b/src/query_farm_sql_scan_planning/planner.py
@@ -1,8 +1,8 @@
 from collections.abc import Callable
 from dataclasses import dataclass
-from decimal import Decimal
 from typing import Any
-
+import duckdb
+import pyarrow as pa
 import sqlglot
 import sqlglot.expressions
 import sqlglot.optimizer.simplify
@@ -19,36 +19,82 @@ class BaseFieldInfo:
 
 
 @dataclass
-class RangeFieldInfo[T: Any](BaseFieldInfo):
+class RangeFieldInfo(BaseFieldInfo):
     """
     Information about a field that has a min and max value.
     """
 
-    min_value: T
-    max_value: T
+    min_value: pa.Scalar
+    max_value: pa.Scalar
 
 
 @dataclass
-class SetFieldInfo[T: Any](BaseFieldInfo):
+class SetFieldInfo(BaseFieldInfo):
     """
     Information about a field where the set of values are known.
     The information about what values that are contained can produce
     false positives.
     """
 
     values: set[
-        T
+        pa.Scalar
     ]  # Set of values that are known to be present in the field, false positives are okay.
 
 
-AnyFieldInfo = (
-    SetFieldInfo[Decimal]
-    | SetFieldInfo[float]
-    | SetFieldInfo[str]
-    | SetFieldInfo[int]
-    | RangeFieldInfo[int]
-    | RangeFieldInfo[None]
-)
+AnyFieldInfo = SetFieldInfo | RangeFieldInfo
+
+
+def _scalar_value_op(
+    a: pa.Scalar, b: pa.Scalar, op: Callable[[Any, Any], bool]
+) -> bool:
+    assert not pa.types.is_null(a.type), (
+        f"Expected a non-null scalar value, got {a} of type {a.type}"
+    )
+    assert not pa.types.is_null(b.type), (
+        f"Expected a non-null scalar value, got {b} of type {b.type}"
+    )
+
+    # If both are integers or both are floats we can do the comparison regardless of their exact types.
+    if pa.types.is_integer(a.type) and pa.types.is_integer(b.type):
+        return op(a.as_py(), b.as_py())
+
+    if pa.types.is_floating(a.type) and pa.types.is_floating(b.type):
+        return op(a.as_py(), b.as_py())
+
+    if pa.types.is_string(a.type) and pa.types.is_string(b.type):
+        return op(a.as_py(), b.as_py())
+
+    if pa.types.is_boolean(a.type) and pa.types.is_boolean(b.type):
+        return op(a.as_py(), b.as_py())
+
+    if pa.types.is_decimal(a.type) and pa.types.is_decimal(b.type):
+        return op(a.as_py(), b.as_py())
+
+    assert type(a) is type(b), (
+        f"Expected same type for comparison, got {type(a)} and {type(b)}"
+    )
+
+    return op(a.as_py(), b.as_py())
+
+
+def _scalar_value_lte(a: pa.Scalar, b: pa.Scalar) -> bool:
+    return _scalar_value_op(a, b, lambda x, y: x <= y)
+
+
+def _scalar_value_lt(a: pa.Scalar, b: pa.Scalar) -> bool:
+    return _scalar_value_op(a, b, lambda x, y: x < y)
+
+
+def _scalar_value_gt(a: pa.Scalar, b: pa.Scalar) -> bool:
+    return _scalar_value_op(a, b, lambda x, y: x > y)
+
+
+def _scalar_value_gte(a: pa.Scalar, b: pa.Scalar) -> bool:
+    return _scalar_value_op(a, b, lambda x, y: x >= y)
+
+
+def _scalar_value_eq(a: pa.Scalar, b: pa.Scalar) -> bool:
+    return _scalar_value_op(a, b, lambda x, y: x == y)
 
 
 FileFieldInfo = dict[str, AnyFieldInfo]
@@ -93,18 +139,29 @@ def _eval_predicate(
         if not isinstance(node.left, sqlglot.expressions.Column):
             return None
 
+        if node.right.find(sqlglot.expressions.Column) is not None:
+            # Can't evaluate this because the right-hand side references a column;
+            # ideally this should be removed further up.
+            return None
+
         # The thing on the right side should be something that can be evaluated against a range.
         # ideally, its going to be a
-        assert isinstance(
-            node.right,
-            sqlglot.expressions.Literal
-            | sqlglot.expressions.Null
-            | sqlglot.expressions.Neg,
-        ), (
-            f"Expected a literal or null on righthand side of predicate {node} got a {type(node.right)}"
-        )
+        if True:  # isinstance(node.right, sqlglot.expressions.Cast):
+            connection = duckdb.connect(":memory:")
+            value_result = connection.execute(
+                f"select {node.right.sql('duckdb')}"
+            ).arrow()
+            assert value_result.num_rows == 1, (
+                f"Expected a single row result from cast, got {value_result.num_rows} rows"
+            )
+            assert value_result.num_columns == 1, (
+                f"Expected a single column result from cast, got {value_result.num_columns} columns"
+            )
 
-        right_val = node.right.to_py()
+            right_val = value_result.column(0)[0]
+            # Interesting behavior: a null result comes back typed as int32, so normalize it to a null-typed scalar.
+            if type(right_val) is pa.Int32Scalar and right_val.as_py() is None:
+                right_val = pa.scalar(None, type=pa.null())
 
         left_val = node.left
         assert isinstance(left_val, sqlglot.expressions.Column), (
@@ -117,17 +174,19 @@ def _eval_predicate(
 
         field_info = file_info.get(referenced_field_name)
 
+        # Right now if the field is not present in the file,
+        # just note that we couldn't evaluate the expression.
         if field_info is None:
             return None
 
         if isinstance(field_info, SetFieldInfo):
             match type(node):
                 case sqlglot.expressions.EQ:
-                    if right_val is None:
+                    if pa.types.is_null(right_val.type):
                         return False
                     return right_val in field_info.values
                 case sqlglot.expressions.NEQ:
-                    if right_val is None:
+                    if pa.types.is_null(right_val.type):
                         return False
                     return right_val not in field_info.values
                 case _:
@@ -136,44 +195,70 @@ def _eval_predicate(
                     )
 
         if type(node) is sqlglot.expressions.NullSafeNEQ:
-            if right_val is not None and field_info.has_non_nulls is False:
+            if (
+                not pa.types.is_null(right_val.type)
+                and field_info.has_non_nulls is False
+            ):
                 return True
-            return not (field_info.min_value == field_info.max_value == right_val)
+
+            if pa.types.is_null(right_val.type):
+                return field_info.has_non_nulls
+
+            return not (
+                _scalar_value_eq(field_info.min_value, field_info.max_value)
+                and _scalar_value_eq(field_info.min_value, right_val)
+            )
+
         elif type(node) is sqlglot.expressions.NullSafeEQ:
-            if right_val is None and field_info.has_non_nulls:
+            if pa.types.is_null(right_val.type) and field_info.has_non_nulls:
                 return True
             if field_info.min_value is None or field_info.max_value is None:
                 return False
-            assert right_val is not None
-            return field_info.min_value <= right_val <= field_info.max_value
+            assert not pa.types.is_null(right_val.type)
+            return _scalar_value_lte(
+                field_info.min_value, right_val
+            ) and _scalar_value_lte(right_val, field_info.max_value)
 
         if field_info.min_value is None or field_info.max_value is None:
             return False
 
-        if right_val is None:
+        if pa.types.is_null(right_val.type):
             return False
 
         match type(node):
             case sqlglot.expressions.EQ:
-                return field_info.min_value <= right_val <= field_info.max_value
+                return _scalar_value_lte(
+                    field_info.min_value, right_val
+                ) and _scalar_value_lte(right_val, field_info.max_value)
             case sqlglot.expressions.NEQ:
-                return not (field_info.min_value == field_info.max_value == right_val)
+                return not (
+                    _scalar_value_eq(field_info.min_value, field_info.max_value)
+                    and _scalar_value_eq(field_info.min_value, right_val)
+                )
             case sqlglot.expressions.LT:
-                return field_info.min_value < right_val
+                return _scalar_value_lt(field_info.min_value, right_val)
             case sqlglot.expressions.LTE:
-                return field_info.min_value <= right_val
+                return _scalar_value_lte(field_info.min_value, right_val)
             case sqlglot.expressions.GT:
-                return field_info.max_value > right_val
+                return _scalar_value_gt(field_info.max_value, right_val)
             case sqlglot.expressions.GTE:
-                return field_info.max_value >= right_val
+                return _scalar_value_gte(field_info.max_value, right_val)
             case sqlglot.expressions.NullSafeEQ:
-                if right_val is None and field_info.has_non_nulls:
+                if pa.types.is_null(right_val.type) and field_info.has_non_nulls:
                     return True
-                return field_info.min_value <= right_val <= field_info.max_value
+                return _scalar_value_lte(
+                    field_info.min_value, right_val
+                ) and _scalar_value_lte(right_val, field_info.max_value)
             case sqlglot.expressions.NullSafeNEQ:
-                if right_val is not None and field_info.has_non_nulls is False:
+                if (
+                    not pa.types.is_null(right_val.type)
+                    and field_info.has_non_nulls is False
+                ):
                     return True
-                return not (field_info.min_value == field_info.max_value == right_val)
+                return not (
+                    _scalar_value_eq(field_info.min_value, field_info.max_value)
+                    and _scalar_value_eq(field_info.min_value, right_val)
+                )
             case _:
                 raise ValueError(f"Unsupported operator type: {type(node)}")
 
@@ -234,14 +319,6 @@ def _evaluate_node_in(
             return False
 
         for in_exp in node.expressions:
-            assert isinstance(
-                in_exp,
-                sqlglot.expressions.Literal
-                | sqlglot.expressions.Neg
-                | sqlglot.expressions.Null,
-            ), (
-                f"Expected a literal in in side of {node}, got {in_exp} type {type(in_exp)}"
-            )
             if self._eval_predicate(
                 file_info,
                 sqlglot.expressions.EQ(this=in_val, expression=in_exp),
@@ -381,9 +458,7 @@ def _evaluate_sql_node(
 
         return False
 
-    def get_matching_files(
-        self, expression: str, *, dialect: str = "duckdb"
-    ) -> set[str]:
+    def get_matching_files(self, exp: sqlglot.expressions.Expression | str) -> set[str]:
         """
         Get a set of files that match the given SQL expression.
         Args:
@@ -392,15 +467,23 @@ def get_matching_files(
             Returns:
                 A set of filenames that match the expression.
         """
-        parse_result = sqlglot.parse_one(expression, dialect=dialect)
+        if isinstance(exp, str):
+            # Parse the expression if it is a string.
+            expression = sqlglot.parse_one(exp, dialect="duckdb")
+        else:
+            expression = exp
+
+        assert isinstance(expression, sqlglot.expressions.Expression), (
+            f"Expected a sqlglot expression, got {type(expression)}"
+        )
 
         # Simplify the parsed expression, move all of the literals to the right side
-        parse_result = sqlglot.optimizer.simplify.simplify(parse_result)
+        expression = sqlglot.optimizer.simplify.simplify(expression)
 
         matching_files = set()
 
         for filename, file_info in self.files:
-            eval_result = self._evaluate_sql_node(parse_result, file_info)
+            eval_result = self._evaluate_sql_node(expression, file_info)
             if eval_result is None or eval_result is True:
                 # If the expression evaluates to True or cannot be evaluated, add the file
                 # to the result set since the caller will be able to filter the rows further.
diff --git a/src/query_farm_sql_scan_planning/test_planner.py b/src/query_farm_sql_scan_planning/test_planner.py

Original file line number	Diff line number	Diff line change
`@@ -7,6 +7,8 @@ authors = [`
`7`	`7`	`]`
`8`	`8`	`dependencies = [`
`9`	`9`	`"sqlglot>=26.33.0",`
	`10`	`+ "duckdb>=1.3.2",`
	`11`	`+ "pyarrow>=20.0.0",`
`10`	`12`	`]`
`11`	`13`	`readme = "README.md"`
`12`	`14`	`requires-python = ">= 3.11"`