Skip to content

Commit 515698b

Browse files
committed
BUG: Fix factorize to ensure proper use of null_encoding parameter
1 parent 3da2c1c commit 515698b

File tree

2 files changed

+16
-4
lines changed

2 files changed

+16
-4
lines changed

Diff for: pandas/core/arrays/arrow/array.py

+3 -4
Original file line number | Diff line number | Diff line change
@@ -1207,10 +1207,9 @@ def factorize(
1207 1207
# https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
1208 1208
data = data.cast(pa.int64())
1209 1209

1210-
if pa.types.is_dictionary(data.type):
1211-
encoded = data
1212-
else:
1213-
encoded = data.dictionary_encode(null_encoding=null_encoding)
1210+
if pa.types.is_dictionary(data.type) and null_encoding == "encode":
1211+
data = data.cast(data.type.value_type)
1212+
encoded = data.dictionary_encode(null_encoding=null_encoding)
1214 1213
if encoded.length() == 0:
1215 1214
indices = np.array([], dtype=np.intp)
1216 1215
uniques = type(self)(pa.chunked_array([], type=encoded.type.value_type))

Diff for: pandas/tests/extension/test_arrow.py

+13
Original file line number | Diff line number | Diff line change
@@ -3329,6 +3329,19 @@ def test_factorize_chunked_dictionary():
3329 3329
tm.assert_index_equal(res_uniques, exp_uniques)
3330 3330

3331 3331

3332+
def test_factorize_dictionary_with_na():
3333+
# Test that factorize properly handles NA values in dictionary arrays
3334+
arr = pd.array(
3335+
["a1", pd.NA], dtype=ArrowDtype(pa.dictionary(pa.int32(), pa.utf8()))
3336+
)
3337+
# Test with use_na_sentinel=False
3338+
indices, uniques = arr.factorize(use_na_sentinel=False)
3339+
expected_indices = np.array([0, 1], dtype=np.intp)
3340+
expected_uniques = pd.array(["a1", None], dtype=ArrowDtype(pa.string()))
3341+
tm.assert_numpy_array_equal(indices, expected_indices)
3342+
tm.assert_extension_array_equal(uniques, expected_uniques)
3343+
3344+
3332 3345
def test_dictionary_astype_categorical():
3333 3346
# GH#56672
3334 3347
arrs = [

0 commit comments

Comments
 (0)