3131import argparse
3232import base64
3333import contextlib
34+ import cProfile
3435import gc
36+ import io
3537import os
38+ import pstats
3639import shutil
3740import sys
3841import time
@@ -142,7 +145,19 @@ def build_parser() -> argparse.ArgumentParser:
142145 default = None ,
143146 help = "Output path. Defaults depend on the mode and input path." ,
144147 )
145- parser .add_argument ("--batch-size" , type = int , default = DEFAULT_BATCH_SIZE )
148+ parser .add_argument ("--parquet-batch-size" , type = int , default = DEFAULT_BATCH_SIZE )
149+ parser .add_argument (
150+ "--batch-size" ,
151+ dest = "parquet_batch_size" ,
152+ type = int ,
153+ help = argparse .SUPPRESS ,
154+ )
155+ parser .add_argument (
156+ "--blosc2-batch-size" ,
157+ type = int ,
158+ default = DEFAULT_BATCH_SIZE ,
159+ help = "Rows grouped into each persisted BatchArray batch for imported Blosc2 varlen/list columns." ,
160+ )
146161 parser .add_argument ("--codec" , type = str , default = "ZSTD" , choices = [c .name for c in blosc2 .Codec ])
147162 parser .add_argument ("--clevel" , type = int , default = 5 )
148163 parser .add_argument (
@@ -162,6 +177,11 @@ def build_parser() -> argparse.ArgumentParser:
162177 default = 1 ,
163178 help = "Print progress every N batches; the final batch is always reported." ,
164179 )
180+ parser .add_argument (
181+ "--profile" ,
182+ action = "store_true" ,
183+ help = "Run the selected operation under cProfile and print cumulative timing stats." ,
184+ )
165185 parser .add_argument ("--overwrite" , action = "store_true" )
166186 return parser
167187
@@ -327,7 +347,8 @@ def print_import_plan(
327347 print (f" Skipped unsupported: { len (skipped )} " )
328348 for name , entry in skipped .items ():
329349 print (f" - { name } : { entry ['reason' ]} " )
330- print (f"Batch size: { args .batch_size :,} " )
350+ print (f"Parquet batch size: { args .parquet_batch_size :,} " )
351+ print (f"Blosc2 batch size: { args .blosc2_batch_size :,} " )
331352 print (f"Codec / level: { args .codec } / { args .clevel } " )
332353 print ()
333354
@@ -337,7 +358,7 @@ def progress_batches(pa, pf, args, selected_cols, struct_wrap_cols):
337358 t0 = time .perf_counter ()
338359 total = pf .metadata .num_rows
339360 for batch_n , raw_batch in enumerate (
340- pf .iter_batches (batch_size = args .batch_size , columns = selected_cols ), start = 1
361+ pf .iter_batches (batch_size = args .parquet_batch_size , columns = selected_cols ), start = 1
341362 ):
342363 report_batch_mem = args .mem_report and batch_n % args .mem_every == 0
343364 if report_batch_mem :
@@ -363,8 +384,10 @@ def progress_batches(pa, pf, args, selected_cols, struct_wrap_cols):
363384
364385
365386def import_parquet_to_ctable (args , input_path : Path , output_path : Path ):
366- if args .batch_size <= 0 :
367- raise ValueError ("--batch-size must be positive" )
387+ if args .parquet_batch_size <= 0 :
388+ raise ValueError ("--parquet-batch-size must be positive" )
389+ if args .blosc2_batch_size <= 0 :
390+ raise ValueError ("--blosc2-batch-size must be positive" )
368391 if args .mem_every <= 0 :
369392 raise ValueError ("--mem-every must be positive" )
370393 if args .batch_report_every <= 0 :
@@ -412,6 +435,7 @@ def import_parquet_to_ctable(args, input_path: Path, output_path: Path):
412435 capacity_hint = pf .metadata .num_rows ,
413436 string_max_length = None ,
414437 auto_null_sentinels = True ,
438+ blosc2_batch_size = args .blosc2_batch_size ,
415439 )
416440 maybe_memory_report (args , "after CTable import" , pa )
417441 store_original_arrow_metadata (ct , parquet_schema , import_schema , conversions )
@@ -451,7 +475,7 @@ def unwrap_singleton_list(pa, arr, arrow_type):
451475def export_ctable_to_parquet (input_path : Path , output_path : Path , * , batch_size : int , overwrite : bool ):
452476 pa , pq = require_pyarrow ()
453477 if batch_size <= 0 :
454- raise ValueError ("--batch-size must be positive" )
478+ raise ValueError ("--parquet- batch-size must be positive" )
455479 prepare_output (output_path , overwrite )
456480 ct = blosc2 .CTable .open (str (input_path ))
457481 original_schema = original_schema_from_ctable (pa , ct )
@@ -549,13 +573,12 @@ def assess_parquet_difference(original_path: Path, roundtrip_path: Path, exporte
549573 print (f" Roundtrip size: { roundtrip_path .stat ().st_size / 1e6 :.1f} MB" )
550574
551575
552- def main (argv : list [str ] | None = None ) -> int :
553- args = build_parser ().parse_args (argv )
576+ def _run_command (args ) -> int :
554577 if args .export :
555578 input_path = args .input_path
556579 output_path = args .output_path or _default_export_output (input_path )
557580 export_ctable_to_parquet (
558- input_path , output_path , batch_size = args .batch_size , overwrite = args .overwrite
581+ input_path , output_path , batch_size = args .parquet_batch_size , overwrite = args .overwrite
559582 )
560583 return 0
561584 if args .roundtrip :
@@ -564,7 +587,7 @@ def main(argv: list[str] | None = None) -> int:
564587 roundtrip_path = _default_roundtrip_output (input_path )
565588 selected = import_parquet_to_ctable (args , input_path , b2_path )
566589 exported = export_ctable_to_parquet (
567- b2_path , roundtrip_path , batch_size = args .batch_size , overwrite = True
590+ b2_path , roundtrip_path , batch_size = args .parquet_batch_size , overwrite = True
568591 )
569592 assess_parquet_difference (input_path , roundtrip_path , exported or selected )
570593 return 0
@@ -574,5 +597,41 @@ def main(argv: list[str] | None = None) -> int:
574597 return 0
575598
576599
def _run_profiled(args) -> int:
    """Execute the selected command under cProfile and print a timing report.

    The report lists the top 50 entries sorted by cumulative time. It is
    printed from the ``finally`` clause, so it is emitted even when the
    command raises.

    Args:
        args: Parsed command-line namespace, forwarded to ``_run_command``.

    Returns:
        Whatever ``_run_command`` returns.
    """
    prof = cProfile.Profile()
    prof.enable()
    try:
        exit_code = _run_command(args)
    finally:
        prof.disable()
        # Render into an in-memory buffer first so trailing whitespace can be
        # trimmed before the report reaches stdout.
        report = io.StringIO()
        pstats.Stats(prof, stream=report).sort_stats("cumulative").print_stats(50)
        print("\n [cProfile] Top cumulative-time functions\n ")
        print(report.getvalue().rstrip())
    return exit_code
613+
def _option_present(argv: list[str], option: str) -> bool:
    """Return True if *option* was passed as a flag in *argv*.

    Both the separated form (``--opt value``) and the joined form
    (``--opt=value``) are recognised. Scanning stops at the ``--``
    end-of-options marker: everything after it is a positional argument for
    argparse, so a positional that merely looks like the flag is not counted.

    Args:
        argv: Command-line tokens, excluding the program name.
        option: Full option string to look for, e.g. ``"--batch-size"``.

    Returns:
        True when the option appears before any ``--`` marker, else False.
    """
    joined_prefix = option + "="
    for token in argv:
        if token == "--":
            # Remaining tokens are positionals, never options.
            return False
        if token == option or token.startswith(joined_prefix):
            return True
    return False
616+
617+
def main(argv: list[str] | None = None) -> int:
    """Parse command-line arguments and run the selected operation.

    The two batch-size options mirror each other: when only one of
    ``--parquet-batch-size`` (or its hidden ``--batch-size`` alias) and
    ``--blosc2-batch-size`` is given, the other takes the same value. When
    neither is given, both fall back to ``DEFAULT_BATCH_SIZE``.

    Args:
        argv: Argument list without the program name. ``None`` lets argparse
            read ``sys.argv[1:]``.

    Returns:
        The value returned by ``_run_profiled`` (with ``--profile``) or
        ``_run_command``.
    """
    parser = build_parser()
    # Replace both batch-size defaults with a None sentinel so that "was it
    # given on the command line?" is answered by argparse itself. Matching
    # raw argv strings misses accepted abbreviations (e.g. --parquet-batch)
    # and disagrees with argparse about tokens following "--".
    parser.set_defaults(parquet_batch_size=None, blosc2_batch_size=None)
    args = parser.parse_args(argv)

    if args.parquet_batch_size is None:
        # Mirror the Blosc2 value when only that one was supplied.
        args.parquet_batch_size = (
            DEFAULT_BATCH_SIZE if args.blosc2_batch_size is None else args.blosc2_batch_size
        )
    if args.blosc2_batch_size is None:
        # parquet_batch_size is resolved by now (explicit or default).
        args.blosc2_batch_size = args.parquet_batch_size

    if args.profile:
        return _run_profiled(args)
    return _run_command(args)
634+
635+
# Script entry point: propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
0 commit comments