Skip to content

Commit 70d89ca

Browse files
authored
Merge pull request #2976 from mabel-dev/#2975
Fix NoneType issue with list checks, and hour filtering for Mabel
2 parents 1e083a8 + 13546ea commit 70d89ca

10 files changed

Lines changed: 384 additions & 21 deletions

File tree

Makefile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,3 +168,7 @@ distclean: clean ## Deep clean including compiled extensions
168168
all: clean dev-install lint mypy test compile ## Run complete development workflow
169169

170170
check-all: lint mypy test coverage ## Run all checks without compilation
171+
172+
loc: ## Count LOC for production code only (excludes tests)
173+
$(call print_blue,'Counting LOC for production files (excluding tests)')
174+
@$(PYTHON) dev/count_loc_basic.py --exclude build,temp,third_party,dev,scratch,tests --ext py,pyx,c,cpp,cc,cxx,h,hpp --per-file

dev/build_counter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ class VersionStatus(Enum):
2727
__minor_version__ = 26
2828
__revision_version__ = 2
2929
__author__ = "@joocer"
30-
__status__ = VersionStatus.BETA
30+
__status__ = VersionStatus.RELEASE
3131

3232
__build__ = None
3333
with open(f"{LIBRARY_NAME}/__version__.py", mode="r", encoding="utf-8") as v:

dev/count_loc_basic.py

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Basic LOC counter for this repo.
4+
Counts non-blank, non-comment lines for file extensions typically used in this repo:
5+
- .py .pyx .c .cpp .cc .cxx .h .hpp
6+
7+
Comments are identified only if the line starts (after whitespace) with `#` or `//`.
8+
This is intentionally simple / fast.
9+
10+
Usage:
11+
python dev/count_loc_basic.py [--root ROOT] [--exclude DIR1,DIR2] [--ext py,pyx,c,cpp,h] [--top N] [--per-file]
12+
13+
The script prints a total and per-language breakdown.
14+
"""
15+
16+
from __future__ import annotations
17+
18+
import argparse
19+
from collections import defaultdict
20+
from pathlib import Path
21+
from typing import Iterable
22+
from typing import List
23+
from typing import Set
24+
from typing import Tuple
25+
26+
DEFAULT_EXTS = ["py", "pyx", "c", "cpp", "cc", "cxx", "h", "hpp"]
27+
DEFAULT_EXCLUDES = {"build", "temp", "third_party", "dev", "dist", "scratch"}
28+
29+
30+
def parse_args():
    """Build the command-line parser and parse the process arguments.

    Returns:
        The ``argparse.Namespace`` with ``root``, ``exclude``, ``ext``,
        ``top`` and ``per_file`` attributes.
    """
    # Render each default once; it is used both as the option default and
    # inside that option's help text.
    exclude_default = ",".join(sorted(DEFAULT_EXCLUDES))
    ext_default = ",".join(DEFAULT_EXTS)

    parser = argparse.ArgumentParser(
        description="Basic LOC counter (non-blank, non-comment lines)"
    )
    parser.add_argument("--root", default=".", help="Root directory to scan")
    parser.add_argument(
        "--exclude",
        default=exclude_default,
        help=f"Comma-separated list of directory names to exclude. Default: {exclude_default}",
    )
    parser.add_argument(
        "--ext",
        default=ext_default,
        help=f"Comma-separated extensions to include (no leading dot). Default: {ext_default}",
    )
    parser.add_argument(
        "--top",
        type=int,
        default=10,
        help="Show top N files by LOC (default 10)",
    )
    parser.add_argument(
        "--per-file",
        action="store_true",
        help="Show counts per file in addition to summary",
    )
    return parser.parse_args()
55+
56+
57+
def should_skip(path: Path, exclude_parts: Set[str]) -> bool:
    """Tell whether `path` contains an excluded component.

    Returns True as soon as one element of ``path.parts`` is found in
    `exclude_parts`, and False when none is.
    """
    for component in path.parts:
        if component in exclude_parts:
            return True
    return False
60+
61+
62+
def find_files(root: Path, exts: Set[str], exclude_parts: Set[str]) -> Iterable[Path]:
    """Yield every file below `root` whose extension is in `exts`.

    Args:
        root: Directory to scan recursively.
        exts: Extensions to include, without the leading dot (e.g. ``{"py"}``).
        exclude_parts: Names to exclude. A file is skipped when any component
            of its path *relative to* `root` is in this collection.

    Yields:
        The matching paths, as produced by ``root.rglob("*")``.
    """
    for path in root.rglob("*"):
        # Pure string test first, so non-matching entries never need a stat().
        suffix = path.suffix
        if not suffix or suffix[1:] not in exts:
            continue
        # Compare only the components below `root`. The caller passes an
        # absolute root, so matching against the full path would drop every
        # file whenever the checkout itself lives under a directory called
        # e.g. "dev", "build" or "temp".
        relative_parts = path.relative_to(root).parts
        if any(part in exclude_parts for part in relative_parts):
            continue
        if path.is_file():
            yield path
71+
72+
73+
def count_file(path: Path) -> int:
    """Count the non-blank, non-comment lines of one source file.

    A line is a comment when, after leading whitespace, it starts with the
    comment prefix that applies to the file's extension:

    - ``.py`` / ``.pyx``: ``#``
    - ``.c`` / ``.cpp`` / ``.cc`` / ``.cxx`` / ``.h`` / ``.hpp``: ``//``
      (``#`` starts a preprocessor directive there, which is code)
    - anything else: either ``#`` or ``//``

    Block comments and trailing comments are deliberately not detected.

    Args:
        path: File to read.

    Returns:
        The number of counted lines, or 0 if the file cannot be read.
    """
    suffix = path.suffix.lower()
    if suffix in (".py", ".pyx"):
        comment_prefixes = ("#",)
    elif suffix in (".c", ".cpp", ".cc", ".cxx", ".h", ".hpp"):
        # `#include`, `#define`, `#ifdef` ... are code, not comments.
        comment_prefixes = ("//",)
    else:
        comment_prefixes = ("#", "//")

    try:
        # Pin the encoding so counts do not depend on the machine's locale;
        # errors="replace" means undecodable bytes never raise.
        with path.open("r", encoding="utf-8", errors="replace") as fh:
            return sum(
                1
                for line in fh
                if (stripped := line.strip())
                and not stripped.startswith(comment_prefixes)
            )
    except OSError:
        # Unreadable file: skip it and report nothing for it.
        return 0
88+
89+
90+
def group_by_ext(path: Path) -> str:
    """Return the language label under which `path` is reported.

    The label is looked up from the file extension; an extension with no
    entry is returned as-is (without the leading dot).
    """
    extension = path.suffix[1:]
    labels = {
        "py": "Python/Cython",
        "pyx": "Python/Cython",
        "c": "C",
        "cpp": "C++",
        "cc": "C++",
        "cxx": "C++",
        "h": "Header",
        "hpp": "Header",
    }
    return labels.get(extension, extension)
101+
102+
103+
def main():
    """Scan the requested tree and print the LOC report.

    Prints the resolved root, the number of files scanned, the total count
    and a per-language breakdown. When ``--per-file`` is given, it also
    prints the ``--top`` largest files by count.
    """
    options = parse_args()
    scan_root = Path(options.root).resolve()
    wanted_exts = {item.strip() for item in options.ext.split(",") if item.strip()}
    excluded = {item.strip() for item in options.exclude.split(",") if item.strip()}

    matched = list(find_files(scan_root, wanted_exts, excluded))

    # Count every file once, then fold the counts into the totals.
    counts: List[Tuple[Path, int]] = [(found, count_file(found)) for found in matched]
    by_language: defaultdict[str, int] = defaultdict(int)
    grand_total = 0
    for found, loc in counts:
        grand_total += loc
        by_language[group_by_ext(found)] += loc

    print("LOC Summary (non-blank, non-comment lines)")
    print(f"Root: {scan_root}")
    print(f"Files scanned: {len(matched)}")
    print(f"Total LOC: {grand_total}")
    print("")
    print("Breakdown by language:")
    for label in sorted(by_language):
        print(f"  {label:12s}: {by_language[label]}")

    if not options.per_file:
        return

    print("")
    print("Top files by LOC:")
    # sorted() is stable, so files with equal counts keep discovery order.
    largest_first = sorted(counts, key=lambda entry: entry[1], reverse=True)
    for found, loc in largest_first[: options.top]:
        print(f"  {loc:6d} {found}")


if __name__ == "__main__":
    main()

opteryx/__version__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
22
# DO NOT EDIT THIS FILE DIRECTLY
33

4-
__build__ = 1974
4+
__build__ = 1976
55
__author__ = "@joocer"
6-
__version__ = "0.26.2-beta.1974"
6+
__version__ = "0.26.2"
77

88
# Store the version here so:
99
# 1) we don't load dependencies by storing it in __init__.py

opteryx/compiled/structures/memory_pool.pyx

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ cdef class MemoryPool:
167167

168168
return best_index
169169

170-
cdef void _merge_adjacent_free_segments(self):
170+
cdef inline void _merge_adjacent_free_segments(self):
171171
"""Merge adjacent free segments (Level 1 compaction)."""
172172
if self.segments.size() <= 1:
173173
return
@@ -610,9 +610,6 @@ cdef class MemoryPool:
610610
self.segments[segment_index] = segment
611611
self.used_size -= segment.length
612612

613-
# Try to merge adjacent free segments
614-
self._merge_adjacent_free_segments()
615-
616613
def available_space(self) -> int64_t:
617614
cdef int64_t total_free = 0
618615
with self.lock:

opteryx/connectors/sql_connector.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -490,8 +490,32 @@ def collect_relation_stats(self) -> RelationStatistics:
490490
# Try with full dataset name first
491491
columns = inspect(self._engine).get_columns(self.dataset)
492492
except Exception:
493-
# Fall back to table name only
494-
columns = inspect(self._engine).get_columns(table_name_only)
493+
try:
494+
# Fall back to table name only
495+
columns = inspect(self._engine).get_columns(table_name_only)
496+
except Exception:
497+
# Some SQLAlchemy/DBAPI combinations (notably certain
498+
# versions of the duckdb engine) attempt to run
499+
# Postgres-specific catalog queries during reflection
500+
# which may not be present in the underlying engine and
501+
# will raise. In that case, gracefully fall back to a
502+
# lightweight stats query (COUNT) so we can still
503+
# provide basic relation statistics instead of failing
504+
# the whole schema discovery.
505+
try:
506+
quoted_dataset = self._quote_dataset_name(self.dataset)
507+
with self._engine.connect() as conn:
508+
result = conn.execute(
509+
text(f"SELECT COUNT(*) AS count FROM {quoted_dataset}")
510+
).fetchone()
511+
count = result[0] if result is not None else None
512+
if count is not None:
513+
stats.record_count = int(count)
514+
stats.record_count_estimate = int(count)
515+
return stats
516+
except Exception:
517+
# Give up and return empty stats rather than raising
518+
return stats
495519

496520
declared_types = self._get_declared_column_types(table_name_only)
497521

opteryx/managers/expression/ops.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -334,18 +334,31 @@ def check_json_pointer(doc, pointer):
334334
if operator == "AtArrow":
335335
from opteryx.compiled.list_ops import list_contains_any
336336

337-
to_pylist = getattr(value, "to_pylist", None)
338-
if to_pylist is not None:
339-
value = to_pylist()
337+
if len(arr) == 0:
338+
return numpy.array([], dtype=numpy.bool_)
339+
340+
if len(arr) == 1:
341+
# Fixed: Handle None element
342+
elem = arr[0]
343+
if elem is None:
344+
return numpy.array([False], dtype=numpy.bool_)
345+
346+
value_set = set(value) if value is not None else set()
347+
try:
348+
elem_set = set(elem)
349+
except TypeError:
350+
elem_set = {elem}
351+
352+
result = bool(elem_set.intersection(value_set))
353+
return numpy.array([result], dtype=numpy.bool_)
340354

341355
to_numpy = getattr(arr, "to_numpy", None)
342356
if to_numpy is not None:
343357
arr = to_numpy(zero_copy_only=False)
344358

345-
if len(arr) == 0:
346-
return numpy.array([], dtype=numpy.bool_)
347-
if len(arr) == 1:
348-
return numpy.array([set(arr[0]).intersection(value)], dtype=numpy.bool_)
359+
to_pylist = getattr(value, "to_pylist", None)
360+
if to_pylist is not None:
361+
value = to_pylist()
349362

350363
return list_contains_any(arr, set(value))
351364

opteryx/managers/schemes/mabel_partitions.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -122,12 +122,19 @@ def _inner(*, date, start, end):
122122
raise UnsupportedSegementationError(dataset=prefix, segment=segment)
123123

124124
if any(f"{OS_SEP}by_hour{OS_SEP}" in blob_name for blob_name in data_blobs):
125-
start = min(start, date)
126-
end = max(end, date)
125+
# Calculate the time range for this specific day
126+
day_start = date.replace(hour=0, minute=0, second=0, microsecond=0)
127+
day_end = (
128+
day_start + datetime.timedelta(days=1) - datetime.timedelta(microseconds=1)
129+
)
130+
131+
# Intersect with the global query range
132+
loop_start = max(start, day_start)
133+
loop_end = min(end, day_end)
127134

128135
selected_blobs = []
129136

130-
for hour in date_range(start, end, "1h"):
137+
for hour in date_range(loop_start, loop_end, "1h"):
131138
hour_label = f"{OS_SEP}by_hour{OS_SEP}hour={hour.hour:02d}/"
132139
# Filter for the specific hour, if hour folders exist
133140
if any(hour_label in blob_name for blob_name in data_blobs):
@@ -152,7 +159,11 @@ def _inner(*, date, start, end):
152159
# Prepare a list of future tasks
153160
futures = [
154161
executor.submit(_inner, **{"date": date, "start": start_date, "end": end_date})
155-
for date in date_range(start_date, end_date, "1d")
162+
for date in date_range(
163+
start_date.replace(hour=0, minute=0, second=0, microsecond=0),
164+
end_date,
165+
"1d",
166+
)
156167
]
157168
# Wait for all futures to complete and collect results
158169
for future in concurrent.futures.as_completed(futures):

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "opteryx"
3-
version = "0.26.2-beta.1974"
3+
version = "0.26.2"
44
description = "Query your data, where it lives"
55
requires-python = '>=3.11'
66
readme = {file = "README.md", content-type = "text/markdown"}

0 commit comments

Comments
 (0)