Skip to content

Commit 363ab01

Browse files
committed
Speed up parquet import
1 parent 2606897 commit 363ab01

6 files changed

Lines changed: 135 additions & 34 deletions

File tree

src/blosc2/cli/parquet_to_blosc2.py

Lines changed: 69 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,11 @@
3131
import argparse
3232
import base64
3333
import contextlib
34+
import cProfile
3435
import gc
36+
import io
3537
import os
38+
import pstats
3639
import shutil
3740
import sys
3841
import time
@@ -142,7 +145,19 @@ def build_parser() -> argparse.ArgumentParser:
142145
default=None,
143146
help="Output path. Defaults depend on the mode and input path.",
144147
)
145-
parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
148+
parser.add_argument("--parquet-batch-size", type=int, default=DEFAULT_BATCH_SIZE)
149+
parser.add_argument(
150+
"--batch-size",
151+
dest="parquet_batch_size",
152+
type=int,
153+
help=argparse.SUPPRESS,
154+
)
155+
parser.add_argument(
156+
"--blosc2-batch-size",
157+
type=int,
158+
default=DEFAULT_BATCH_SIZE,
159+
help="Rows grouped into each persisted BatchArray batch for imported Blosc2 varlen/list columns.",
160+
)
146161
parser.add_argument("--codec", type=str, default="ZSTD", choices=[c.name for c in blosc2.Codec])
147162
parser.add_argument("--clevel", type=int, default=5)
148163
parser.add_argument(
@@ -162,6 +177,11 @@ def build_parser() -> argparse.ArgumentParser:
162177
default=1,
163178
help="Print progress every N batches; the final batch is always reported.",
164179
)
180+
parser.add_argument(
181+
"--profile",
182+
action="store_true",
183+
help="Run the selected operation under cProfile and print cumulative timing stats.",
184+
)
165185
parser.add_argument("--overwrite", action="store_true")
166186
return parser
167187

@@ -327,7 +347,8 @@ def print_import_plan(
327347
print(f" Skipped unsupported: {len(skipped)}")
328348
for name, entry in skipped.items():
329349
print(f" - {name}: {entry['reason']}")
330-
print(f"Batch size: {args.batch_size:,}")
350+
print(f"Parquet batch size: {args.parquet_batch_size:,}")
351+
print(f"Blosc2 batch size: {args.blosc2_batch_size:,}")
331352
print(f"Codec / level: {args.codec} / {args.clevel}")
332353
print()
333354

@@ -337,7 +358,7 @@ def progress_batches(pa, pf, args, selected_cols, struct_wrap_cols):
337358
t0 = time.perf_counter()
338359
total = pf.metadata.num_rows
339360
for batch_n, raw_batch in enumerate(
340-
pf.iter_batches(batch_size=args.batch_size, columns=selected_cols), start=1
361+
pf.iter_batches(batch_size=args.parquet_batch_size, columns=selected_cols), start=1
341362
):
342363
report_batch_mem = args.mem_report and batch_n % args.mem_every == 0
343364
if report_batch_mem:
@@ -363,8 +384,10 @@ def progress_batches(pa, pf, args, selected_cols, struct_wrap_cols):
363384

364385

365386
def import_parquet_to_ctable(args, input_path: Path, output_path: Path):
366-
if args.batch_size <= 0:
367-
raise ValueError("--batch-size must be positive")
387+
if args.parquet_batch_size <= 0:
388+
raise ValueError("--parquet-batch-size must be positive")
389+
if args.blosc2_batch_size <= 0:
390+
raise ValueError("--blosc2-batch-size must be positive")
368391
if args.mem_every <= 0:
369392
raise ValueError("--mem-every must be positive")
370393
if args.batch_report_every <= 0:
@@ -412,6 +435,7 @@ def import_parquet_to_ctable(args, input_path: Path, output_path: Path):
412435
capacity_hint=pf.metadata.num_rows,
413436
string_max_length=None,
414437
auto_null_sentinels=True,
438+
blosc2_batch_size=args.blosc2_batch_size,
415439
)
416440
maybe_memory_report(args, "after CTable import", pa)
417441
store_original_arrow_metadata(ct, parquet_schema, import_schema, conversions)
@@ -451,7 +475,7 @@ def unwrap_singleton_list(pa, arr, arrow_type):
451475
def export_ctable_to_parquet(input_path: Path, output_path: Path, *, batch_size: int, overwrite: bool):
452476
pa, pq = require_pyarrow()
453477
if batch_size <= 0:
454-
raise ValueError("--batch-size must be positive")
478+
raise ValueError("--parquet-batch-size must be positive")
455479
prepare_output(output_path, overwrite)
456480
ct = blosc2.CTable.open(str(input_path))
457481
original_schema = original_schema_from_ctable(pa, ct)
@@ -549,13 +573,12 @@ def assess_parquet_difference(original_path: Path, roundtrip_path: Path, exporte
549573
print(f" Roundtrip size: {roundtrip_path.stat().st_size / 1e6:.1f} MB")
550574

551575

552-
def main(argv: list[str] | None = None) -> int:
553-
args = build_parser().parse_args(argv)
576+
def _run_command(args) -> int:
554577
if args.export:
555578
input_path = args.input_path
556579
output_path = args.output_path or _default_export_output(input_path)
557580
export_ctable_to_parquet(
558-
input_path, output_path, batch_size=args.batch_size, overwrite=args.overwrite
581+
input_path, output_path, batch_size=args.parquet_batch_size, overwrite=args.overwrite
559582
)
560583
return 0
561584
if args.roundtrip:
@@ -564,7 +587,7 @@ def main(argv: list[str] | None = None) -> int:
564587
roundtrip_path = _default_roundtrip_output(input_path)
565588
selected = import_parquet_to_ctable(args, input_path, b2_path)
566589
exported = export_ctable_to_parquet(
567-
b2_path, roundtrip_path, batch_size=args.batch_size, overwrite=True
590+
b2_path, roundtrip_path, batch_size=args.parquet_batch_size, overwrite=True
568591
)
569592
assess_parquet_difference(input_path, roundtrip_path, exported or selected)
570593
return 0
@@ -574,5 +597,41 @@ def main(argv: list[str] | None = None) -> int:
574597
return 0
575598

576599

600+
def _run_profiled(args) -> int:
601+
profiler = cProfile.Profile()
602+
profiler.enable()
603+
try:
604+
return _run_command(args)
605+
finally:
606+
profiler.disable()
607+
stream = io.StringIO()
608+
stats = pstats.Stats(profiler, stream=stream).sort_stats("cumulative")
609+
stats.print_stats(50)
610+
print("\n[cProfile] Top cumulative-time functions\n")
611+
print(stream.getvalue().rstrip())
612+
613+
614+
def _option_present(argv: list[str], option: str) -> bool:
615+
return any(arg == option or arg.startswith(option + "=") for arg in argv)
616+
617+
618+
def main(argv: list[str] | None = None) -> int:
619+
argv = sys.argv[1:] if argv is None else list(argv)
620+
args = build_parser().parse_args(argv)
621+
622+
parquet_specified = _option_present(argv, "--parquet-batch-size") or _option_present(
623+
argv, "--batch-size"
624+
)
625+
blosc2_specified = _option_present(argv, "--blosc2-batch-size")
626+
if parquet_specified and not blosc2_specified:
627+
args.blosc2_batch_size = args.parquet_batch_size
628+
elif blosc2_specified and not parquet_specified:
629+
args.parquet_batch_size = args.blosc2_batch_size
630+
631+
if args.profile:
632+
return _run_profiled(args)
633+
return _run_command(args)
634+
635+
577636
if __name__ == "__main__":
578637
raise SystemExit(main())

src/blosc2/ctable.py

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1549,6 +1549,7 @@ def _fmt_bytes(n: int) -> str:
15491549

15501550

15511551
_EXPECTED_SIZE_DEFAULT = 1_048_576
1552+
_BATCH_SIZE_DEFAULT = 2048
15521553

15531554
# ---------------------------------------------------------------------------
15541555
# Computed-column definition (virtual columns backed by a LazyExpr)
@@ -2804,7 +2805,7 @@ def iter_arrow_batches(
28042805
self,
28052806
*,
28062807
columns: list[str] | None = None,
2807-
batch_size: int = 65_536,
2808+
batch_size: int = _BATCH_SIZE_DEFAULT,
28082809
include_computed: bool = True,
28092810
):
28102811
"""Yield live rows as bounded-size :class:`pyarrow.RecordBatch` objects."""
@@ -3123,7 +3124,11 @@ def _write_arrow_batch(cls, batch, columns, new_cols, new_valid, pos: int) -> in
31233124
return pos
31243125
for col in columns:
31253126
arrow_col = batch.column(batch.schema.get_field_index(col.name))
3126-
if cls._is_list_column(col) or cls._is_varlen_scalar_column(col):
3127+
if cls._is_list_column(col):
3128+
# Trusted Arrow-import fast path: schema has already been inferred,
3129+
# so avoid Python-level per-item coercion/validation here.
3130+
new_cols[col.name].extend(arrow_col.to_pylist(), validate=False)
3131+
elif cls._is_varlen_scalar_column(col):
31273132
new_cols[col.name].extend(arrow_col.to_pylist())
31283133
else:
31293134
new_cols[col.name][pos : pos + m] = cls._arrow_column_to_numpy(arrow_col, col)
@@ -3181,7 +3186,7 @@ def from_arrow(
31813186
capacity_hint: int | None = None,
31823187
string_max_length: int | None = None,
31833188
auto_null_sentinels: bool = True,
3184-
list_batch_rows: int | None = 2048,
3189+
blosc2_batch_size: int | None = _BATCH_SIZE_DEFAULT,
31853190
) -> CTable:
31863191
"""Build a :class:`CTable` from an Arrow schema and iterable of record batches.
31873192
@@ -3196,13 +3201,14 @@ def from_arrow(
31963201
:func:`~blosc2.string` / :func:`~blosc2.bytes` columns whose dtype is
31973202
sized to *string_max_length* characters/bytes.
31983203
3199-
``list_batch_rows`` controls how many rows are buffered before
3200-
list-valued columns are flushed to their backend. Set it to ``None``
3201-
to keep list columns pending until the final flush.
3204+
``blosc2_batch_size`` controls how many rows are buffered before
3205+
BatchArray-backed imported columns (list columns and varlen scalar
3206+
columns) are flushed to their backend. Set it to ``None`` to keep
3207+
those columns pending until the final flush.
32023208
"""
32033209
pa = cls._require_pyarrow("from_arrow()")
3204-
if list_batch_rows is not None and list_batch_rows <= 0:
3205-
raise ValueError("list_batch_rows must be a positive integer or None")
3210+
if blosc2_batch_size is not None and blosc2_batch_size <= 0:
3211+
raise ValueError("blosc2_batch_size must be a positive integer or None")
32063212
batches = iter(batches)
32073213
first_batch = None
32083214
table_for_inference = None
@@ -3217,10 +3223,12 @@ def from_arrow(
32173223
string_max_length,
32183224
auto_null_sentinels=auto_null_sentinels,
32193225
)
3220-
if list_batch_rows is not None:
3226+
if blosc2_batch_size is not None:
32213227
for col in columns:
3222-
if cls._is_list_column(col) and getattr(col.spec, "storage", None) == "batch":
3223-
col.spec.batch_rows = list_batch_rows
3228+
if (
3229+
cls._is_list_column(col) and getattr(col.spec, "storage", None) == "batch"
3230+
) or cls._is_varlen_scalar_column(col):
3231+
col.spec.batch_rows = blosc2_batch_size
32243232
compiled = CompiledSchema(
32253233
row_cls=None,
32263234
columns=columns,
@@ -3253,7 +3261,7 @@ def to_parquet(
32533261
path,
32543262
*,
32553263
columns: list[str] | None = None,
3256-
batch_size: int = 65_536,
3264+
batch_size: int = _BATCH_SIZE_DEFAULT,
32573265
compression: str | None = "zstd",
32583266
row_group_size: int | None = None,
32593267
include_computed: bool = True,
@@ -3277,14 +3285,14 @@ def from_parquet(
32773285
path,
32783286
*,
32793287
columns: list[str] | None = None,
3280-
batch_size: int = 65_536,
3288+
batch_size: int = _BATCH_SIZE_DEFAULT,
32813289
urlpath: str | None = None,
32823290
mode: str = "w",
32833291
cparams=None,
32843292
dparams=None,
32853293
validate: bool = False,
32863294
auto_null_sentinels: bool = True,
3287-
list_batch_rows: int | None = 2048,
3295+
blosc2_batch_size: int | None = _BATCH_SIZE_DEFAULT,
32883296
**kwargs,
32893297
) -> CTable:
32903298
"""Read a Parquet file into a :class:`CTable` batch-wise using pyarrow."""
@@ -3311,7 +3319,7 @@ def from_parquet(
33113319
capacity_hint=pf.metadata.num_rows if pf.metadata is not None else None,
33123320
string_max_length=string_max_length,
33133321
auto_null_sentinels=auto_null_sentinels,
3314-
list_batch_rows=list_batch_rows,
3322+
blosc2_batch_size=blosc2_batch_size,
33153323
)
33163324

33173325
# ------------------------------------------------------------------

src/blosc2/list_array.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -354,7 +354,9 @@ def extend(self, values: Iterable[Any], *, validate: bool = True) -> None:
354354
if validate:
355355
cells = [coerce_list_cell(self.spec, v) for v in values]
356356
else:
357-
cells = [v if v is not None else [] for v in values]
357+
# Trusted fast path used by Arrow/Parquet import and internal row reordering.
358+
# Preserve nullable list cells as native None and skip all per-item coercion.
359+
cells = list(values)
358360
if self.spec.storage == "vl":
359361
self._backend.extend(iter(cells))
360362
self._persisted_row_count = len(self._backend)
@@ -662,7 +664,7 @@ def from_arrow(
662664
items_per_block=items_per_block,
663665
**kwargs,
664666
)
665-
arr.extend(arrow_array.to_pylist())
667+
arr.extend(arrow_array.to_pylist(), validate=False)
666668
return arr
667669

668670
def __enter__(self) -> ListArray:

tests/ctable/test_arrow_interop.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,32 @@ def test_from_arrow_string_fixed_width_with_max_length():
230230
assert t["name"][:].tolist() == ["hi", "hello world", "!"]
231231

232232

233+
def test_from_arrow_list_struct_nullable_values_roundtrip():
234+
nutrient_type = pa.struct(
235+
[
236+
pa.field("name", pa.string()),
237+
pa.field("value", pa.float64()),
238+
]
239+
)
240+
at = pa.table(
241+
{
242+
"id": pa.array([1, 2, 3], type=pa.int64()),
243+
"nutriments": pa.array(
244+
[
245+
[{"name": "fat", "value": 1.5}, {"name": "salt", "value": 0.2}],
246+
None,
247+
[{"name": "energy", "value": 42.0}],
248+
],
249+
type=pa.list_(nutrient_type),
250+
),
251+
}
252+
)
253+
t = CTable.from_arrow(at.schema, at.to_batches())
254+
assert t[0].nutriments == [{"name": "fat", "value": 1.5}, {"name": "salt", "value": 0.2}]
255+
assert t[1].nutriments is None
256+
assert t[2].nutriments == [{"name": "energy", "value": 42.0}]
257+
258+
233259
def test_from_arrow_unsupported_type_raises():
234260
at = pa.table({"ts": pa.array([1, 2, 3], type=pa.timestamp("s"))})
235261
with pytest.raises(TypeError, match="No blosc2 spec"):

tests/ctable/test_parquet_interop.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -342,25 +342,25 @@ def test_interop_write_with_pyarrow(self, tmp_path):
342342
assert len(t) == 3
343343
assert t.col_names == ["x", "y"]
344344

345-
def test_from_arrow_list_batch_rows_default(self):
345+
def test_from_arrow_blosc2_batch_size_default(self):
346346
at = pa.table({"vals": pa.array([[1], [2, 3]], type=pa.list_(pa.int64()))})
347347
t = CTable.from_arrow(at.schema, at.to_batches())
348348
assert t._schema.columns_by_name["vals"].spec.batch_rows == 2048
349349
assert t["vals"][0] == [1]
350350
assert t["vals"][1] == [2, 3]
351351

352-
def test_from_arrow_list_batch_rows_override_and_none(self):
352+
def test_from_arrow_blosc2_batch_size_override_and_none(self):
353353
at = pa.table({"vals": pa.array([[1], [2], [3]], type=pa.list_(pa.int64()))})
354-
t = CTable.from_arrow(at.schema, at.to_batches(max_chunksize=1), list_batch_rows=2)
354+
t = CTable.from_arrow(at.schema, at.to_batches(max_chunksize=1), blosc2_batch_size=2)
355355
assert t._schema.columns_by_name["vals"].spec.batch_rows == 2
356356

357-
t2 = CTable.from_arrow(at.schema, at.to_batches(max_chunksize=1), list_batch_rows=None)
357+
t2 = CTable.from_arrow(at.schema, at.to_batches(max_chunksize=1), blosc2_batch_size=None)
358358
assert t2._schema.columns_by_name["vals"].spec.batch_rows is None
359359

360-
def test_from_arrow_invalid_list_batch_rows_raises(self):
360+
def test_from_arrow_invalid_blosc2_batch_size_raises(self):
361361
at = pa.table({"vals": pa.array([[1]], type=pa.list_(pa.int64()))})
362-
with pytest.raises(ValueError, match="list_batch_rows"):
363-
CTable.from_arrow(at.schema, at.to_batches(), list_batch_rows=0)
362+
with pytest.raises(ValueError, match="blosc2_batch_size"):
363+
CTable.from_arrow(at.schema, at.to_batches(), blosc2_batch_size=0)
364364

365365
def test_vlstring_arrow_roundtrip_no_singleton_list(self):
366366
"""Scalar string columns import as vlstring (not list<string>) without singleton wrapping."""

tests/test_list_array.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,3 +82,9 @@ def test_listarray_arrow_roundtrip():
8282
arr = blosc2.ListArray.from_arrow(values, item_spec=blosc2.string(), nullable=True)
8383
assert arr[:] == [["a"], None, ["b", "c"]]
8484
assert arr.to_arrow().to_pylist() == [["a"], None, ["b", "c"]]
85+
86+
87+
def test_listarray_extend_validate_false_preserves_none():
88+
arr = blosc2.ListArray(item_spec=blosc2.int32(), nullable=True, storage="batch", batch_rows=2)
89+
arr.extend([[1], None, [2, 3]], validate=False)
90+
assert arr[:] == [[1], None, [2, 3]]

0 commit comments

Comments (0)