Merge pull request #2976 from mabel-dev/#2975

joocer · web-flow · commit 70d89caddab8 · 2025-12-10T22:51:29.000Z
Fix NoneType issue with list checks, and hour filtering for Mabel
diff --git a/Makefile b/Makefile
@@ -168,3 +168,7 @@ distclean: clean ## Deep clean including compiled extensions
 all: clean dev-install lint mypy test compile ## Run complete development workflow
 
 check-all: lint mypy test coverage ## Run all checks without compilation
+
+loc: ## Count LOC for production code only (excludes tests)
+	$(call print_blue,'Counting LOC for production files (excluding tests)')
+	@$(PYTHON) dev/count_loc_basic.py --exclude build,temp,third_party,dev,scratch,tests --ext py,pyx,c,cpp,cc,cxx,h,hpp --per-file
diff --git a/dev/build_counter.py b/dev/build_counter.py
@@ -27,7 +27,7 @@ class VersionStatus(Enum):
 __minor_version__ = 26
 __revision_version__ = 2
 __author__ = "@joocer"
-__status__ = VersionStatus.BETA
+__status__ = VersionStatus.RELEASE
 
 __build__ = None
 with open(f"{LIBRARY_NAME}/__version__.py", mode="r", encoding="utf-8") as v:
diff --git a/dev/count_loc_basic.py b/dev/count_loc_basic.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python3
+"""
+Basic LOC counter for this repo.
+Counts non-blank, non-comment lines for file extensions typically used in this repo:
+- .py .pyx .c .cpp .cc .cxx .h .hpp
+
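+Comments are identified only if the line starts (after whitespace) with `#` or `//`.
+The same rule is applied to every file type, so C/C++ preprocessor lines such as
+`#include` are treated as comments, while block comments (`/* ... */`) and
+docstrings are counted as code. This is intentionally simple / fast.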
+
+Usage:
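+    python dev/count_loc_basic.py [--root ROOT] [--exclude DIR1,DIR2] [--ext py,pyx,c,cpp,h] [--per-file] [--top N]
+
+The script prints a total and per-language breakdown. With `--per-file` it also
+lists the top N files by LOC (`--top` has no effect without `--per-file`).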
+"""
+
+from __future__ import annotations
+
+import argparse
+from collections import defaultdict
+from pathlib import Path
+from typing import Iterable
+from typing import List
+from typing import Set
+from typing import Tuple
+
+DEFAULT_EXTS = ["py", "pyx", "c", "cpp", "cc", "cxx", "h", "hpp"]
+DEFAULT_EXCLUDES = {"build", "temp", "third_party", "dev", "dist", "scratch"}
+
+
+def parse_args():
+    """Parse the command-line options of the LOC counter.
+
+    Returns:
+        The argparse namespace with ``root``, ``exclude``, ``ext``, ``top``
+        and ``per_file`` attributes.
+    """
+    # Computed once: each string is both the default value and part of the help.
+    exclude_default = ",".join(sorted(DEFAULT_EXCLUDES))
+    ext_default = ",".join(DEFAULT_EXTS)
+
+    parser = argparse.ArgumentParser(
+        description="Basic LOC counter (non-blank, non-comment lines)"
+    )
+    parser.add_argument("--root", default=".", help="Root directory to scan")
+    parser.add_argument(
+        "--exclude",
+        default=exclude_default,
+        help=f"Comma-separated list of directory names to exclude. Default: {exclude_default}",
+    )
+    parser.add_argument(
+        "--ext",
+        default=ext_default,
+        help=f"Comma-separated extensions to include (no leading dot). Default: {ext_default}",
+    )
+    parser.add_argument(
+        "--top",
+        type=int,
+        default=10,
+        help="Show top N files by LOC (default 10)",
+    )
+    parser.add_argument(
+        "--per-file",
+        action="store_true",
+        help="Show counts per file in addition to summary",
+    )
+    return parser.parse_args()
+
+
+def should_skip(path: Path, exclude_parts: Set[str]) -> bool:
+    """Return True if any component of `path` is in exclude_parts.
+
+    Every component of `path` is inspected, so callers should pass a path
+    relative to the scan root; with an absolute path, directories *above*
+    the root can also trigger a match.
+    """
+    return any(part in exclude_parts for part in path.parts)
+
+
+def find_files(root: Path, exts: Set[str], exclude_parts: Set[str]) -> Iterable[Path]:
+    """Yield every file below `root` whose extension is in `exts`.
+
+    Args:
+        root: Directory to scan recursively.
+        exts: Extensions to include, without the leading dot.
+        exclude_parts: Names that exclude a file when they appear as a
+            component of its path below `root`.
+
+    Yields:
+        Paths as produced by ``root.rglob`` (i.e. prefixed with `root`).
+    """
+    for path in root.rglob("*"):
+        if not path.is_file():
+            continue
+        # Match exclusions against the part of the path below `root` only.
+        # Testing the full path would drop every file when the checkout
+        # itself lives under a directory named e.g. "dev" or "build".
+        if should_skip(path.relative_to(root), exclude_parts):
+            continue
+        if path.suffix and path.suffix[1:] in exts:
+            yield path
+
+
+def count_file(path: Path) -> int:
+    """Count the non-blank, non-comment lines of the file at `path`.
+
+    A line is a comment when its first non-whitespace characters are ``#``
+    or ``//``; the same rule is applied regardless of file type.
+
+    Args:
+        path: File to read.
+
+    Returns:
+        The number of counted lines, or 0 if the file cannot be read.
+    """
+    cnt = 0
+    try:
+        # Decode explicitly as UTF-8 so the result does not depend on the
+        # platform's locale encoding; undecodable bytes are replaced rather
+        # than aborting the count.
+        with path.open("r", encoding="utf-8", errors="replace") as fh:
+            for line in fh:
+                s = line.lstrip()
+                # An empty remainder means the line was blank.
+                if not s or s.startswith(("#", "//")):
+                    continue
+                cnt += 1
+    except (OSError, UnicodeDecodeError):
+        # If we can't read a file for whatever reason, just skip it and return 0
+        return 0
+    return cnt
+
+
+def group_by_ext(path: Path) -> str:
+    """Map the extension of `path` to the language label used in the summary.
+
+    Extensions without a dedicated label are returned unchanged (without the
+    leading dot).
+    """
+    labels = {
+        "py": "Python/Cython",
+        "pyx": "Python/Cython",
+        "c": "C",
+        "cpp": "C++",
+        "cc": "C++",
+        "cxx": "C++",
+        "h": "Header",
+        "hpp": "Header",
+    }
+    extension = path.suffix[1:]
+    return labels.get(extension, extension)
+
+
+def main():
+    """Scan the requested root and print the LOC report.
+
+    Prints the total and a per-language breakdown; with ``--per-file`` it
+    additionally prints the ``--top`` largest files by LOC.
+    """
+    args = parse_args()
+    root = Path(args.root).resolve()
+    # Comma-separated options: trim whitespace and drop empty entries.
+    exts = {item.strip() for item in args.ext.split(",") if item.strip()}
+    excludes = {item.strip() for item in args.exclude.split(",") if item.strip()}
+
+    files = list(find_files(root, exts, excludes))
+
+    per_file_counts: List[Tuple[Path, int]] = [(path, count_file(path)) for path in files]
+
+    total = 0
+    ext_totals: defaultdict[str, int] = defaultdict(int)
+    for path, loc in per_file_counts:
+        total += loc
+        ext_totals[group_by_ext(path)] += loc
+
+    print("LOC Summary (non-blank, non-comment lines)")
+    print(f"Root: {root}")
+    print(f"Files scanned: {len(files)}")
+    print(f"Total LOC: {total}")
+    print("")
+    print("Breakdown by language:")
+    for label in sorted(ext_totals):
+        print(f"  {label:12s}: {ext_totals[label]}")
+
+    if not args.per_file:
+        return
+
+    print("")
+    print("Top files by LOC:")
+    # Stable sort: files with equal counts keep their discovery order.
+    ranked = sorted(per_file_counts, key=lambda entry: entry[1], reverse=True)
+    for path, loc in ranked[: args.top]:
+        print(f"  {loc:6d} {path}")
+
+if __name__ == "__main__":
+    main()
diff --git a/opteryx/__version__.py b/opteryx/__version__.py
@@ -1,9 +1,9 @@
 # THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
 # DO NOT EDIT THIS FILE DIRECTLY
 
-__build__ = 1974
+__build__ = 1976
 __author__ = "@joocer"
-__version__ = "0.26.2-beta.1974"
+__version__ = "0.26.2"
 
 # Store the version here so:
 # 1) we don't load dependencies by storing it in __init__.py
diff --git a/opteryx/compiled/structures/memory_pool.pyx b/opteryx/compiled/structures/memory_pool.pyx
@@ -167,7 +167,7 @@ cdef class MemoryPool:
 
         return best_index
 
-    cdef void _merge_adjacent_free_segments(self):
+    cdef inline void _merge_adjacent_free_segments(self):
         """Merge adjacent free segments (Level 1 compaction)."""
         if self.segments.size() <= 1:
             return
@@ -610,9 +610,6 @@ cdef class MemoryPool:
             self.segments[segment_index] = segment
             self.used_size -= segment.length
 
-            # Try to merge adjacent free segments
-            self._merge_adjacent_free_segments()
-
     def available_space(self) -> int64_t:
         cdef int64_t total_free = 0
         with self.lock:
diff --git a/opteryx/connectors/sql_connector.py b/opteryx/connectors/sql_connector.py
@@ -490,8 +490,32 @@ def collect_relation_stats(self) -> RelationStatistics:
                 # Try with full dataset name first
                 columns = inspect(self._engine).get_columns(self.dataset)
             except Exception:
-                # Fall back to table name only
-                columns = inspect(self._engine).get_columns(table_name_only)
+                try:
+                    # Fall back to table name only
+                    columns = inspect(self._engine).get_columns(table_name_only)
+                except Exception:
+                    # Some SQLAlchemy/DBAPI combinations (notably certain
+                    # versions of the duckdb engine) attempt to run
+                    # Postgres-specific catalog queries during reflection
+                    # which may not be present in the underlying engine and
+                    # will raise. In that case, gracefully fall back to a
+                    # lightweight stats query (COUNT) so we can still
+                    # provide basic relation statistics instead of failing
+                    # the whole schema discovery.
+                    try:
+                        quoted_dataset = self._quote_dataset_name(self.dataset)
+                        with self._engine.connect() as conn:
+                            result = conn.execute(
+                                text(f"SELECT COUNT(*) AS count FROM {quoted_dataset}")
+                            ).fetchone()
+                        count = result[0] if result is not None else None
+                        if count is not None:
+                            stats.record_count = int(count)
+                            stats.record_count_estimate = int(count)
+                        return stats
+                    except Exception:
+                        # Give up and return empty stats rather than raising
+                        return stats
 
             declared_types = self._get_declared_column_types(table_name_only)
 
diff --git a/opteryx/managers/expression/ops.py b/opteryx/managers/expression/ops.py
@@ -334,18 +334,31 @@ def check_json_pointer(doc, pointer):
     if operator == "AtArrow":
         from opteryx.compiled.list_ops import list_contains_any
 
-        to_pylist = getattr(value, "to_pylist", None)
-        if to_pylist is not None:
-            value = to_pylist()
+        if len(arr) == 0:
+            return numpy.array([], dtype=numpy.bool_)
+
+        if len(arr) == 1:
+            # Fixed: Handle None element
+            elem = arr[0]
+            if elem is None:
+                return numpy.array([False], dtype=numpy.bool_)
+
+            value_set = set(value) if value is not None else set()
+            try:
+                elem_set = set(elem)
+            except TypeError:
+                elem_set = {elem}
+
+            result = bool(elem_set.intersection(value_set))
+            return numpy.array([result], dtype=numpy.bool_)
 
         to_numpy = getattr(arr, "to_numpy", None)
         if to_numpy is not None:
             arr = to_numpy(zero_copy_only=False)
 
-        if len(arr) == 0:
-            return numpy.array([], dtype=numpy.bool_)
-        if len(arr) == 1:
-            return numpy.array([set(arr[0]).intersection(value)], dtype=numpy.bool_)
+        to_pylist = getattr(value, "to_pylist", None)
+        if to_pylist is not None:
+            value = to_pylist()
 
         return list_contains_any(arr, set(value))
 
diff --git a/opteryx/managers/schemes/mabel_partitions.py b/opteryx/managers/schemes/mabel_partitions.py
@@ -122,12 +122,19 @@ def _inner(*, date, start, end):
                             raise UnsupportedSegementationError(dataset=prefix, segment=segment)
 
             if any(f"{OS_SEP}by_hour{OS_SEP}" in blob_name for blob_name in data_blobs):
-                start = min(start, date)
-                end = max(end, date)
+                # Calculate the time range for this specific day
+                day_start = date.replace(hour=0, minute=0, second=0, microsecond=0)
+                day_end = (
+                    day_start + datetime.timedelta(days=1) - datetime.timedelta(microseconds=1)
+                )
+
+                # Intersect with the global query range
+                loop_start = max(start, day_start)
+                loop_end = min(end, day_end)
 
                 selected_blobs = []
 
-                for hour in date_range(start, end, "1h"):
+                for hour in date_range(loop_start, loop_end, "1h"):
                     hour_label = f"{OS_SEP}by_hour{OS_SEP}hour={hour.hour:02d}/"
                     # Filter for the specific hour, if hour folders exist
                     if any(hour_label in blob_name for blob_name in data_blobs):
@@ -152,7 +159,11 @@ def _inner(*, date, start, end):
             # Prepare a list of future tasks
             futures = [
                 executor.submit(_inner, **{"date": date, "start": start_date, "end": end_date})
-                for date in date_range(start_date, end_date, "1d")
+                for date in date_range(
+                    start_date.replace(hour=0, minute=0, second=0, microsecond=0),
+                    end_date,
+                    "1d",
+                )
             ]
             # Wait for all futures to complete and collect results
             for future in concurrent.futures.as_completed(futures):
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "opteryx"
-version = "0.26.2-beta.1974"
+version = "0.26.2"
 description = "Query your data, where it lives"
 requires-python = '>=3.11'
 readme = {file = "README.md", content-type = "text/markdown"}
diff --git a/tests/unit/managers/test_mabel_partitions.py b/tests/unit/managers/test_mabel_partitions.py