BUG: Fix factorize to ensure proper use of null_encoding parameter

asharmalik19 · asharmalik19 · commit b80f33cda63e · 2025-01-29T14:22:01.000-05:00
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -1207,10 +1207,9 @@ def factorize(
             # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
             data = data.cast(pa.int64())
 
-        if pa.types.is_dictionary(data.type):
-            encoded = data
-        else:
-            encoded = data.dictionary_encode(null_encoding=null_encoding)
+        if pa.types.is_dictionary(data.type) and null_encoding == "encode":
+            data = data.cast(data.type.value_type)
+        encoded = data.dictionary_encode(null_encoding=null_encoding)
         if encoded.length() == 0:
             indices = np.array([], dtype=np.intp)
             uniques = type(self)(pa.chunked_array([], type=encoded.type.value_type))
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
@@ -3330,6 +3330,19 @@ def test_factorize_chunked_dictionary():
     tm.assert_index_equal(res_uniques, exp_uniques)
 
 
+def test_factorize_dictionary_with_na():
+    # Dictionary-typed array + use_na_sentinel=False: NA must get its own code
+    arr = pd.array(
+        ["a1", pd.NA], dtype=ArrowDtype(pa.dictionary(pa.int32(), pa.utf8()))
+    )
+    # NA is expected at code 1 and as the trailing None entry in uniques
+    indices, uniques = arr.factorize(use_na_sentinel=False)
+    expected_indices = np.array([0, 1], dtype=np.intp)
+    expected_uniques = pd.array(["a1", None], dtype=ArrowDtype(pa.string()))
+    tm.assert_numpy_array_equal(indices, expected_indices)
+    tm.assert_extension_array_equal(uniques, expected_uniques)
+
+
 def test_dictionary_astype_categorical():
     # GH#56672
     arrs = [