11
11
import numpy as np
12
12
import pytest
13
13
14
- from pandas._config import using_string_dtype
15
-
16
14
import pandas.util._test_decorators as td
17
15
18
16
import pandas as pd
@@ -347,9 +345,8 @@ def test_write_dta6(self, datapath):
347
345
check_index_type=False,
348
346
)
349
347
350
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
351
348
@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
352
- def test_read_write_dta10(self, version):
349
+ def test_read_write_dta10(self, version, using_infer_string ):
353
350
original = DataFrame(
354
351
data=[["string", "object", 1, 1.1, np.datetime64("2003-12-25")]],
355
352
columns=["string", "object", "integer", "floating", "datetime"],
@@ -362,12 +359,17 @@ def test_read_write_dta10(self, version):
362
359
with tm.ensure_clean() as path:
363
360
original.to_stata(path, convert_dates={"datetime": "tc"}, version=version)
364
361
written_and_read_again = self.read_dta(path)
365
- # original.index is np.int32, read index is np.int64
366
- tm.assert_frame_equal(
367
- written_and_read_again.set_index("index"),
368
- original,
369
- check_index_type=False,
370
- )
362
+
363
+ expected = original.copy()
364
+ if using_infer_string:
365
+ expected["object"] = expected["object"].astype("str")
366
+
367
+ # original.index is np.int32, read index is np.int64
368
+ tm.assert_frame_equal(
369
+ written_and_read_again.set_index("index"),
370
+ expected,
371
+ check_index_type=False,
372
+ )
371
373
372
374
def test_stata_doc_examples(self):
373
375
with tm.ensure_clean() as path:
@@ -1153,7 +1155,6 @@ def test_categorical_ordering(self, file, datapath):
1153
1155
assert parsed[col].cat.ordered
1154
1156
assert not parsed_unordered[col].cat.ordered
1155
1157
1156
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
1157
1158
@pytest.mark.filterwarnings("ignore::UserWarning")
1158
1159
@pytest.mark.parametrize(
1159
1160
"file",
@@ -1215,6 +1216,10 @@ def _convert_categorical(from_frame: DataFrame) -> DataFrame:
1215
1216
if cat.categories.dtype == object:
1216
1217
categories = pd.Index._with_infer(cat.categories._values)
1217
1218
cat = cat.set_categories(categories)
1219
+ elif cat.categories.dtype == "string" and len(cat.categories) == 0:
1220
+ # if the read categories are empty, it comes back as object dtype
1221
+ categories = cat.categories.astype(object)
1222
+ cat = cat.set_categories(categories)
1218
1223
from_frame[col] = cat
1219
1224
return from_frame
1220
1225
@@ -1244,7 +1249,6 @@ def test_iterator(self, datapath):
1244
1249
from_chunks = pd.concat(itr)
1245
1250
tm.assert_frame_equal(parsed, from_chunks)
1246
1251
1247
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
1248
1252
@pytest.mark.filterwarnings("ignore::UserWarning")
1249
1253
@pytest.mark.parametrize(
1250
1254
"file",
@@ -1548,12 +1552,11 @@ def test_inf(self, infval):
1548
1552
with tm.ensure_clean() as path:
1549
1553
df.to_stata(path)
1550
1554
1551
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
1552
1555
def test_path_pathlib(self):
1553
1556
df = DataFrame(
1554
1557
1.1 * np.arange(120).reshape((30, 4)),
1555
- columns=pd.Index(list("ABCD"), dtype=object ),
1556
- index=pd.Index([f"i-{i}" for i in range(30)], dtype=object ),
1558
+ columns=pd.Index(list("ABCD")),
1559
+ index=pd.Index([f"i-{i}" for i in range(30)]),
1557
1560
)
1558
1561
df.index.name = "index"
1559
1562
reader = lambda x: read_stata(x).set_index("index")
@@ -1584,13 +1587,12 @@ def test_value_labels_iterator(self, write_index):
1584
1587
value_labels = dta_iter.value_labels()
1585
1588
assert value_labels == {"A": {0: "A", 1: "B", 2: "C", 3: "E"}}
1586
1589
1587
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
1588
1590
def test_set_index(self):
1589
1591
# GH 17328
1590
1592
df = DataFrame(
1591
1593
1.1 * np.arange(120).reshape((30, 4)),
1592
- columns=pd.Index(list("ABCD"), dtype=object ),
1593
- index=pd.Index([f"i-{i}" for i in range(30)], dtype=object ),
1594
+ columns=pd.Index(list("ABCD")),
1595
+ index=pd.Index([f"i-{i}" for i in range(30)]),
1594
1596
)
1595
1597
df.index.name = "index"
1596
1598
with tm.ensure_clean() as path:
@@ -1618,8 +1620,7 @@ def test_date_parsing_ignores_format_details(self, column, datapath):
1618
1620
formatted = df.loc[0, column + "_fmt"]
1619
1621
assert unformatted == formatted
1620
1622
1621
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
1622
- def test_writer_117(self):
1623
+ def test_writer_117(self, using_infer_string):
1623
1624
original = DataFrame(
1624
1625
data=[
1625
1626
[
@@ -1682,13 +1683,17 @@ def test_writer_117(self):
1682
1683
version=117,
1683
1684
)
1684
1685
written_and_read_again = self.read_dta(path)
1685
- # original.index is np.int32, read index is np.int64
1686
- tm.assert_frame_equal(
1687
- written_and_read_again.set_index("index"),
1688
- original,
1689
- check_index_type=False,
1690
- )
1691
- tm.assert_frame_equal(original, copy)
1686
+
1687
+ expected = original[:]
1688
+ if using_infer_string:
1689
+ # object dtype (with only strings/None) comes back as string dtype
1690
+ expected["object"] = expected["object"].astype("str")
1691
+
1692
+ tm.assert_frame_equal(
1693
+ written_and_read_again.set_index("index"),
1694
+ expected,
1695
+ )
1696
+ tm.assert_frame_equal(original, copy)
1692
1697
1693
1698
def test_convert_strl_name_swap(self):
1694
1699
original = DataFrame(
@@ -1725,15 +1730,14 @@ def test_invalid_date_conversion(self):
1725
1730
with pytest.raises(ValueError, match=msg):
1726
1731
original.to_stata(path, convert_dates={"wrong_name": "tc"})
1727
1732
1728
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
1729
1733
@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
1730
1734
def test_nonfile_writing(self, version):
1731
1735
# GH 21041
1732
1736
bio = io.BytesIO()
1733
1737
df = DataFrame(
1734
1738
1.1 * np.arange(120).reshape((30, 4)),
1735
- columns=pd.Index(list("ABCD"), dtype=object ),
1736
- index=pd.Index([f"i-{i}" for i in range(30)], dtype=object ),
1739
+ columns=pd.Index(list("ABCD")),
1740
+ index=pd.Index([f"i-{i}" for i in range(30)]),
1737
1741
)
1738
1742
df.index.name = "index"
1739
1743
with tm.ensure_clean() as path:
@@ -1744,13 +1748,12 @@ def test_nonfile_writing(self, version):
1744
1748
reread = read_stata(path, index_col="index")
1745
1749
tm.assert_frame_equal(df, reread)
1746
1750
1747
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
1748
1751
def test_gzip_writing(self):
1749
1752
# writing version 117 requires seek and cannot be used with gzip
1750
1753
df = DataFrame(
1751
1754
1.1 * np.arange(120).reshape((30, 4)),
1752
- columns=pd.Index(list("ABCD"), dtype=object ),
1753
- index=pd.Index([f"i-{i}" for i in range(30)], dtype=object ),
1755
+ columns=pd.Index(list("ABCD")),
1756
+ index=pd.Index([f"i-{i}" for i in range(30)]),
1754
1757
)
1755
1758
df.index.name = "index"
1756
1759
with tm.ensure_clean() as path:
@@ -1777,8 +1780,7 @@ def test_unicode_dta_118(self, datapath):
1777
1780
1778
1781
tm.assert_frame_equal(unicode_df, expected)
1779
1782
1780
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
1781
- def test_mixed_string_strl(self):
1783
+ def test_mixed_string_strl(self, using_infer_string):
1782
1784
# GH 23633
1783
1785
output = [{"mixed": "string" * 500, "number": 0}, {"mixed": None, "number": 1}]
1784
1786
output = DataFrame(output)
@@ -1796,7 +1798,10 @@ def test_mixed_string_strl(self):
1796
1798
path, write_index=False, convert_strl=["mixed"], version=117
1797
1799
)
1798
1800
reread = read_stata(path)
1799
- expected = output.fillna("")
1801
+ expected = output.copy()
1802
+ if using_infer_string:
1803
+ expected["mixed"] = expected["mixed"].astype("str")
1804
+ expected = expected.fillna("")
1800
1805
tm.assert_frame_equal(reread, expected)
1801
1806
1802
1807
@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
@@ -1875,7 +1880,7 @@ def test_stata_119(self, datapath):
1875
1880
reader._ensure_open()
1876
1881
assert reader._nvar == 32999
1877
1882
1878
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) ")
1883
+ @pytest.mark.filterwarnings("ignore:Downcasting behavior:FutureWarning ")
1879
1884
@pytest.mark.parametrize("version", [118, 119, None])
1880
1885
def test_utf8_writer(self, version):
1881
1886
cat = pd.Categorical(["a", "β", "ĉ"], ordered=True)
@@ -2143,14 +2148,13 @@ def test_iterator_errors(datapath, chunksize):
2143
2148
pass
2144
2149
2145
2150
2146
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
2147
2151
def test_iterator_value_labels():
2148
2152
# GH 31544
2149
2153
values = ["c_label", "b_label"] + ["a_label"] * 500
2150
2154
df = DataFrame({f"col{k}": pd.Categorical(values, ordered=True) for k in range(2)})
2151
2155
with tm.ensure_clean() as path:
2152
2156
df.to_stata(path, write_index=False)
2153
- expected = pd.Index(["a_label", "b_label", "c_label"], dtype="object" )
2157
+ expected = pd.Index(["a_label", "b_label", "c_label"])
2154
2158
with read_stata(path, chunksize=100) as reader:
2155
2159
for j, chunk in enumerate(reader):
2156
2160
for i in range(2):
0 commit comments