@@ -326,3 +326,88 @@ def convert_str_slice_to_int_slice(
326
326
stop = columns .index (str_slice .stop ) + 1 if str_slice .stop is not None else None
327
327
step = str_slice .step
328
328
return (start , stop , step )
329
+
330
+
331
# Building blocks of the "whole value" pattern. Each group is optional so that
# FULL_RE can split a string into whichever of date / separator / time /
# timezone it contains.
DATE_RE = r"^(?P<date>\d{1,4}[-/.]\d{1,2}[-/.]\d{1,4})?"
SEP_RE = r"(?P<sep>\s|T)?"
# 24-hour HH:MM:SS only; an AM/PM period is not recognised.
TIME_RE = r"(?P<time>\d{2}:\d{2}:\d{2})?"
# Matches 'Z', '+02:00' and '+0200'. A bare hour offset such as '+02' does
# NOT match: two minute digits are required.
TZ_RE = r"(?P<tz>Z|[+-]\d{2}:?\d{2})?$"
FULL_RE = "".join((DATE_RE, SEP_RE, TIME_RE, TZ_RE))

# Named groups shared by the three date-order patterns below.
_YEAR_GROUP = r"(?P<year>(?:[12][0-9])?[0-9]{2})"
_MONTH_GROUP = r"(?P<month>0[1-9]|1[0-2])"
_DAY_GROUP = r"(?P<day>0[1-9]|[12][0-9]|3[01])"
_SEP1_GROUP = r"(?P<sep1>[-/.])"
_SEP2_GROUP = r"(?P<sep2>[-/.])"

# One fully anchored pattern per supported field order.
YMD_RE = f"^{_YEAR_GROUP}{_SEP1_GROUP}{_MONTH_GROUP}{_SEP2_GROUP}{_DAY_GROUP}$"
DMY_RE = f"^{_DAY_GROUP}{_SEP1_GROUP}{_MONTH_GROUP}{_SEP2_GROUP}{_YEAR_GROUP}$"
MDY_RE = f"^{_MONTH_GROUP}{_SEP1_GROUP}{_DAY_GROUP}{_SEP2_GROUP}{_YEAR_GROUP}$"

# (pattern, format template) pairs, tried in this order. The "-" in each
# template is a placeholder for the separator actually found in the data.
DATE_FORMATS = (
    (YMD_RE, "%Y-%m-%d"),
    (DMY_RE, "%d-%m-%Y"),
    (MDY_RE, "%m-%d-%Y"),
)
348
+
349
+
350
def parse_datetime_format(arr: pa.StringArray) -> str:
    """Try to infer a strptime-style datetime format from an array of strings.

    Every non-null value is split with ``FULL_RE`` into date, separator, time
    and timezone parts; the date and time parts are then delegated to
    ``_parse_date_format`` and ``_parse_time_format``.

    Arguments:
        arr: String values to inspect. Either a pyarrow ``Array`` or a
            ``ChunkedArray`` is accepted. Null entries are ignored.

    Returns:
        The concatenation of the inferred date format, the separator found in
        the data, the inferred time format and ``"%z"`` when a timezone part
        is present.

    Raises:
        NotImplementedError: If some value does not match ``FULL_RE``.
        ValueError: If the values use more than one separator or more than one
            timezone, or if ``_parse_date_format`` cannot infer the date part.
    """
    import pyarrow as pa  # ignore-banned-import
    import pyarrow.compute as pc  # ignore-banned-import

    # Nulls carry no format information, but `extract_regex` emits a null row
    # for them, which would otherwise be reported as an unsupported format.
    extracted = pc.extract_regex(pc.drop_null(arr), pattern=FULL_RE)
    if isinstance(extracted, pa.ChunkedArray):
        # converts from ChunkedArray to StructArray
        extracted = pa.concat_arrays(extracted.chunks)
    matches = extracted

    if not pc.all(matches.is_valid()).as_py():
        msg = (
            "Unable to infer datetime format, provided format is not supported. "
            "Please report a bug to https://github.com/narwhals-dev/narwhals/issues"
        )
        raise NotImplementedError(msg)

    dates = matches.field("date")
    separators = matches.field("sep")
    times = matches.field("time")
    tz = matches.field("tz")

    # separators and time zones must be unique
    if pc.count(pc.unique(separators)).as_py() > 1:
        msg = "Found multiple separator values while inferring datetime format."
        raise ValueError(msg)

    if pc.count(pc.unique(tz)).as_py() > 1:
        msg = "Found multiple timezone values while inferring datetime format."
        raise ValueError(msg)

    date_value = _parse_date_format(dates)
    time_value = _parse_time_format(times)

    sep_value = separators[0].as_py()
    tz_value = "%z" if tz[0].as_py() else ""

    # The parts are joined with nothing in between: any extra literal
    # character here would become part of the returned format.
    return f"{date_value}{sep_value}{time_value}{tz_value}"
387
+
388
+
389
def _parse_date_format(arr: pa.Array) -> str:
    """Infer the date portion of a datetime format.

    Each ``(pattern, template)`` pair in ``DATE_FORMATS`` is tried in order.
    A pair is accepted when every value matches the pattern and one single
    separator character is used in both separator positions across all values.

    Arguments:
        arr: Date strings to inspect.

    Returns:
        The accepted template with ``"-"`` replaced by the separator found in
        the data.

    Raises:
        ValueError: If no pair in ``DATE_FORMATS`` is accepted.
    """
    import pyarrow.compute as pc  # ignore-banned-import

    for pattern, template in DATE_FORMATS:
        extracted = pc.extract_regex(arr, pattern=pattern)

        # Every value has to match this field order.
        if not pc.all(extracted.is_valid()).as_py():
            continue

        # Each separator position must hold exactly one distinct character.
        first_seps = extracted.field("sep1")
        if pc.count(pc.unique(first_seps)).as_py() != 1:
            continue
        second_seps = extracted.field("sep2")
        if pc.count(pc.unique(second_seps)).as_py() != 1:
            continue

        # ...and that character must be the same in both positions.
        separator = first_seps[0].as_py()
        if separator != second_seps[0].as_py():
            continue

        return template.replace("-", separator)

    msg = (
        "Unable to infer datetime format. "
        "Please report a bug to https://github.com/narwhals-dev/narwhals/issues"
    )
    raise ValueError(msg)
407
+
408
+
409
def _parse_time_format(arr: pa.Array) -> str:
    """Infer the time portion of a datetime format.

    Arguments:
        arr: Time strings to inspect.

    Returns:
        ``"%H:%M:%S"`` when every value is an ``HH:MM:SS`` time, otherwise
        the empty string.
    """
    import pyarrow.compute as pc  # ignore-banned-import

    # TIME_RE cannot be used on its own here: its only group is optional and
    # the pattern is unanchored, so it matches every string (including the
    # empty one) and date-only input would be reported as having a time part.
    strict_time_re = r"^(?P<time>\d{2}:\d{2}:\d{2})$"
    matches = pc.extract_regex(arr, pattern=strict_time_re)
    return "%H:%M:%S" if pc.all(matches.is_valid()).as_py() else ""
0 commit comments