From 587e5d76f810c5314c81a79acdaeb8b15a7ba385 Mon Sep 17 00:00:00 2001 From: Jesse Date: Mon, 14 Apr 2025 09:02:27 +0000 Subject: [PATCH 1/3] ENH: Update DataFrame.to_stata to handle pd.NA and None values in strL columns --- doc/source/whatsnew/v2.3.0.rst | 1 + pandas/io/stata.py | 4 ++-- pandas/tests/io/test_stata.py | 14 ++++++++++++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 230332319e0ac..5770f02c409d9 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -150,6 +150,7 @@ MultiIndex I/O ^^^ - :meth:`DataFrame.to_excel` was storing decimals as strings instead of numbers (:issue:`49598`) +- :meth:`DataFrame.to_stata` no longer throws a ``TypeError('encoding without a string argument')`` when exporting a column containing both long strings (Stata strL) and :class:`pd.NA` values (:issue:`23633`) - Period diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 34d95fb59a21c..49080144abee5 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -3196,8 +3196,8 @@ def generate_table(self) -> tuple[dict[str, tuple[int, int]], DataFrame]: for o, (idx, row) in enumerate(selected.iterrows()): for j, (col, v) in enumerate(col_index): val = row[col] - # Allow columns with mixed str and None (GH 23633) - val = "" if val is None else val + # Allow columns with mixed str and None or pd.NA (GH 23633) + val = "" if (val is None) or isna(val) else val key = gso_table.get(val, None) if key is None: # Stata prefers human numbers diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 9288b98d79fbe..e73de78847c8f 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -2587,3 +2587,17 @@ def test_many_strl(temp_file, version): lbls = ["".join(v) for v in itertools.product(*([string.ascii_letters] * 3))] value_labels = {"col": {i: lbls[i] for i in range(n)}} df.to_stata(temp_file, value_labels=value_labels, 
version=version) + + +@pytest.mark.parametrize("version", [117, 118, 119, None]) +def test_strl_missings(temp_file, version): + # GH 23633 + # Check that strl supports None and pd.NA + df = DataFrame( + [ + {"str1": "string" * 500, "number": 0}, + {"str1": None, "number": 1}, + {"str1": pd.NA, "number": 1}, + ] + ) + df.to_stata(temp_file, version=version) From 92ed281fa43b3490f4f30cd7fd9a79dfd1da5f24 Mon Sep 17 00:00:00 2001 From: Danferno Date: Tue, 22 Apr 2025 10:10:58 +0200 Subject: [PATCH 2/3] Update pandas/io/stata.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/io/stata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 49080144abee5..cd290710ddbaa 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -3197,7 +3197,7 @@ def generate_table(self) -> tuple[dict[str, tuple[int, int]], DataFrame]: for j, (col, v) in enumerate(col_index): val = row[col] # Allow columns with mixed str and None or pd.NA (GH 23633) - val = "" if (val is None) or isna(val) else val + val = "" if isna(val) else val key = gso_table.get(val, None) if key is None: # Stata prefers human numbers From 11266a47cd71764c8c6a30737642d0f7faa83484 Mon Sep 17 00:00:00 2001 From: Jesse Date: Tue, 22 Apr 2025 08:17:03 +0000 Subject: [PATCH 3/3] Moved changelog msg to 3.0.0 and adapted phrasing --- doc/source/whatsnew/v2.3.0.rst | 1 - doc/source/whatsnew/v3.0.0.rst | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 5770f02c409d9..230332319e0ac 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -150,7 +150,6 @@ MultiIndex I/O ^^^ - :meth:`DataFrame.to_excel` was storing decimals as strings instead of numbers (:issue:`49598`) -- :meth:`DataFrame.to_stata` no longer throws a ``TypeError('encoding without a string argument')`` when exporting a column containing both long 
strings (Stata strL) and :class:`pd.NA` values (:issue:`23633`) - Period diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 184ca581902ee..ac7b489721cd9 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -733,6 +733,7 @@ I/O - Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`) - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`) - Bug in :meth:`DataFrame.to_excel` where the :class:`MultiIndex` index with a period level was not a date (:issue:`60099`) +- Bug in :meth:`DataFrame.to_stata` raising ``TypeError`` when exporting a column containing both long strings (Stata strL) and :class:`pd.NA` values (:issue:`23633`) - Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder=`big```. (:issue:`58969`) - Bug in :meth:`DataFrame.to_stata` when writing more than 32,000 value labels. (:issue:`60107`) - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`)