File tree: 2 files changed, +16 -4 lines changed
Original file line number | Diff line number | Diff line change
@@ -1207,10 +1207,9 @@ def factorize(
1207
1207
# https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
1208
1208
data = data .cast (pa .int64 ())
1209
1209
1210
- if pa .types .is_dictionary (data .type ):
1211
- encoded = data
1212
- else :
1213
- encoded = data .dictionary_encode (null_encoding = null_encoding )
1210
+ if pa .types .is_dictionary (data .type ) and null_encoding == "encode" :
1211
+ data = data .cast (data .type .value_type )
1212
+ encoded = data .dictionary_encode (null_encoding = null_encoding )
1214
1213
if encoded .length () == 0 :
1215
1214
indices = np .array ([], dtype = np .intp )
1216
1215
uniques = type (self )(pa .chunked_array ([], type = encoded .type .value_type ))
Original file line number | Diff line number | Diff line change
@@ -3330,6 +3330,19 @@ def test_factorize_chunked_dictionary():
3330
3330
tm .assert_index_equal (res_uniques , exp_uniques )
3331
3331
3332
3332
3333
+ def test_factorize_dictionary_with_na ():
3334
+ # Test that factorize properly handles NA values in dictionary arrays
3335
+ arr = pd .array (
3336
+ ["a1" , pd .NA ], dtype = ArrowDtype (pa .dictionary (pa .int32 (), pa .utf8 ()))
3337
+ )
3338
+ # Test with use_na_sentinel=False
3339
+ indices , uniques = arr .factorize (use_na_sentinel = False )
3340
+ expected_indices = np .array ([0 , 1 ], dtype = np .intp )
3341
+ expected_uniques = pd .array (["a1" , None ], dtype = ArrowDtype (pa .string ()))
3342
+ tm .assert_numpy_array_equal (indices , expected_indices )
3343
+ tm .assert_extension_array_equal (uniques , expected_uniques )
3344
+
3345
+
3333
3346
def test_dictionary_astype_categorical ():
3334
3347
# GH#56672
3335
3348
arrs = [
You can’t perform that action at this time.
0 commit comments