@@ -3338,7 +3338,119 @@ def from_parquet(
         blosc2_items_per_block: int | None = None,
         **kwargs,
     ) -> CTable:
3341- """Read a Parquet file into a :class:`CTable` batch-wise using pyarrow."""
3341+ """Read a Parquet file into a :class:`CTable`.
3342+
3343+ The Parquet file is streamed batch by batch through :mod:`pyarrow` and then
3344+ converted into a typed :class:`CTable`. By default, the result is created in
3345+ memory, but you can also persist it on disk via ``urlpath``.
3346+
3347+ This method delegates the actual table construction to
3348+ :meth:`CTable.from_arrow`, so Arrow schema handling, nullable-column support,
3349+ and Blosc2 write tuning follow the same rules as that method.
3350+
3351+ Parameters
3352+ ----------
3353+ path : str or path-like
3354+ Path to the source Parquet file.
3355+
3356+ columns : list[str] or None, optional
3357+ Subset of columns to read from the Parquet file. If provided, only these
3358+ columns are loaded and their order in the resulting table matches the
3359+ order in this list. Column names must be unique.
3360+
3361+ batch_size : int, optional
3362+ Number of rows per Arrow batch read from the Parquet file. This controls
3363+ how much data is pulled from the file at a time before being handed off
3364+ to the CTable builder. Must be greater than 0.
3365+
3366+ urlpath : str or None, optional
3367+ Destination storage path for the resulting CTable. If ``None`` (the
3368+ default), the table is created in memory. If provided, the table is backed
3369+ by persistent on-disk storage.
3370+
3371+ mode : str, optional
3372+ Storage open mode for ``urlpath``. Defaults to ``"w"``. This is passed
3373+ through to :meth:`CTable.from_arrow`.
3374+
3375+ cparams : object, optional
3376+ Compression parameters for the created Blosc2 containers. Passed through
3377+ to :meth:`CTable.from_arrow`.
3378+
3379+ dparams : object, optional
3380+ Decompression parameters for the created Blosc2 containers. Passed through
3381+ to :meth:`CTable.from_arrow`.
3382+
3383+ validate : bool, optional
3384+ Whether to enable extra internal validation while building the table.
3385+ Defaults to ``False``.
3386+
3387+ auto_null_sentinels : bool, optional
3388+ If ``True`` (default), nullable scalar columns imported from Parquet may
3389+ automatically receive per-column null sentinel values when needed. Sentinel
3390+ selection follows the current null-policy rules used by CTable schema
3391+ handling.
3392+
3393+ blosc2_batch_size : int or None, optional
3394+ Number of items written to Blosc2 containers per internal write batch.
3395+ Passed through to :meth:`CTable.from_arrow`.
3396+
3397+ blosc2_items_per_block : int or None, optional
3398+ Target number of items per internal Blosc2 block. Passed through to
3399+ :meth:`CTable.from_arrow`.
3400+
3401+ **kwargs
3402+ Additional keyword arguments forwarded to ``pyarrow.parquet.ParquetFile``.
3403+ Use these for Parquet-reader-specific options supported by PyArrow.
3404+
3405+ Returns
3406+ -------
3407+ CTable
3408+ A new :class:`CTable` populated from the Parquet file. The table contains
3409+ all selected columns and all rows from the file. If ``urlpath`` is
3410+ provided, the returned table is disk-backed; otherwise it is in-memory.
3411+
+        Raises
+        ------
+        ImportError
+            If :mod:`pyarrow` is not installed.
+        ValueError
+            If ``batch_size`` is not greater than 0, or if ``columns`` contains
+            duplicate names.
+        Exception
+            Any exception raised by :mod:`pyarrow` while opening or reading the Parquet
+            file, or by :meth:`CTable.from_arrow` while converting Arrow data into a
+            CTable.
+
+        Examples
+        --------
+        Load an entire Parquet file into an in-memory table:
+
+        >>> import blosc2
+        >>> t = blosc2.CTable.from_parquet("data.parquet")
+
+        Load only a subset of columns:
+
+        >>> t = blosc2.CTable.from_parquet(
+        ...     "data.parquet",
+        ...     columns=["user_id", "amount", "country"],
+        ... )
+
+        Create a disk-backed table while reading in batches:
+
+        >>> t = blosc2.CTable.from_parquet(
+        ...     "data.parquet",
+        ...     batch_size=50_000,
+        ...     urlpath="data.ctable",
+        ... )
+
+        Pass additional options through to PyArrow's Parquet reader:
+
+        >>> t = blosc2.CTable.from_parquet(
+        ...     "data.parquet",
+        ...     memory_map=True,
+        ... )
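+
+        Tune compression and Blosc2 write batching for the created containers. The
+        values here are illustrative, and the accepted ``cparams`` form depends on
+        the installed Blosc2 version (a plain mapping is assumed in this sketch):
+
+        >>> t = blosc2.CTable.from_parquet(
+        ...     "data.parquet",
+        ...     cparams={"clevel": 5},
+        ...     blosc2_batch_size=100_000,
+        ... )
+
+        Disable automatic null sentinels and enable extra validation while building:
+
+        >>> t = blosc2.CTable.from_parquet(
+        ...     "data.parquet",
+        ...     auto_null_sentinels=False,
+        ...     validate=True,
+        ... )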
3453+ """
         pq = cls._require_pyarrow_parquet("from_parquet()")
         pa = cls._require_pyarrow("from_parquet()")
         cls._validate_arrow_batch_size(batch_size)