Backport PR pandas-dev#60321: TST (string dtype): resolve all xfails in IO parser tests

jorisvandenbossche · WillAyd · commit 284e359bfd8d · 2024-11-15T12:54:29.000-05:00
(cherry picked from commit ee3c18f)
diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py
@@ -7,8 +7,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas._libs import parsers as libparsers
 from pandas.errors import DtypeWarning
 
@@ -230,8 +228,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch):
     assert result.a.dtype == float
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
-def test_warn_if_chunks_have_mismatched_type(all_parsers):
+def test_warn_if_chunks_have_mismatched_type(all_parsers, using_infer_string):
     warning_type = None
     parser = all_parsers
     size = 10000
@@ -259,8 +256,12 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers):
             "Specify dtype option on import or set low_memory=False.",
             buf,
         )
-
-    assert df.a.dtype == object
+    if parser.engine == "c" and parser.low_memory:
+        assert df.a.dtype == object
+    elif using_infer_string:
+        assert df.a.dtype == "str"
+    else:
+        assert df.a.dtype == object
 
 
 @pytest.mark.parametrize("iterator", [True, False])
diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py
@@ -14,8 +14,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.errors import (
     EmptyDataError,
     ParserError,
@@ -69,14 +67,13 @@ def test_local_file(all_parsers, csv_dir_path):
         pytest.skip("Failing on: " + " ".join(platform.uname()))
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @xfail_pyarrow  # AssertionError: DataFrame.index are different
 def test_path_path_lib(all_parsers):
     parser = all_parsers
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )
     result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0))
     tm.assert_frame_equal(df, result)
diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py
@@ -8,8 +8,6 @@
 
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas import (
     DataFrame,
     Index,
@@ -87,9 +85,13 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
-def test_multi_index_no_level_names(all_parsers, index_col):
+def test_multi_index_no_level_names(
+    request, all_parsers, index_col, using_infer_string
+):
+    if using_infer_string and all_parsers.engine == "pyarrow":
+        # result should have string columns instead of object dtype
+        request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
     data = """index1,index2,A,B,C,D
 foo,one,2,3,4,5
 foo,two,7,8,9,10
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -8,8 +8,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.errors import ParserWarning
 
 import pandas as pd
@@ -54,7 +52,6 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string):
         tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtype_per_column(all_parsers):
     parser = all_parsers
@@ -68,7 +65,6 @@ def test_dtype_per_column(all_parsers):
         [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"]
     )
     expected["one"] = expected["one"].astype(np.float64)
-    expected["two"] = expected["two"].astype(object)
 
     result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str})
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py
@@ -17,8 +17,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat.numpy import np_version_gte1p24
 from pandas.errors import (
     ParserError,
@@ -185,8 +183,7 @@ def error(val: float, actual_val: Decimal) -> Decimal:
     assert max(precise_errors) <= max(normal_errors)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
-def test_usecols_dtypes(c_parser_only):
+def test_usecols_dtypes(c_parser_only, using_infer_string):
     parser = c_parser_only
     data = """\
 1,2,3
@@ -211,8 +208,12 @@ def test_usecols_dtypes(c_parser_only):
         dtype={"b": int, "c": float},
     )
 
-    assert (result.dtypes == [object, int, float]).all()
-    assert (result2.dtypes == [object, float]).all()
+    if using_infer_string:
+        assert (result.dtypes == ["string", int, float]).all()
+        assert (result2.dtypes == ["string", float]).all()
+    else:
+        assert (result.dtypes == [object, int, float]).all()
+        assert (result2.dtypes == [object, float]).all()
 
 
 def test_disable_bool_parsing(c_parser_only):
diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py
@@ -8,8 +8,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -186,7 +184,6 @@ def convert_score(x):
         tm.assert_frame_equal(results[0], results[1])
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize("conv_f", [lambda x: x, str])
 def test_converter_index_col_bug(all_parsers, conv_f):
     # see gh-1835 , GH#40589
@@ -205,7 +202,7 @@ def test_converter_index_col_bug(all_parsers, conv_f):
         StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
     )
 
-    xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A", dtype="object"))
+    xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A"))
     tm.assert_frame_equal(rs, xp)
 
 
diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py
@@ -8,8 +8,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas import (
     DataFrame,
     Index,
@@ -344,7 +342,6 @@ def test_infer_types_boolean_sum(all_parsers):
     tm.assert_frame_equal(result, expected, check_index_type=False)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)])
 def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
     # GH#9435
@@ -355,7 +352,7 @@ def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
             pytest.mark.xfail(reason="Cannot disable type-inference for pyarrow engine")
         )
     result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype})
-    expected = DataFrame({"b": [2]}, index=Index([val], name="a"))
+    expected = DataFrame({"b": [2]}, index=Index([val], name="a", dtype=dtype))
     tm.assert_frame_equal(result, expected)
 
 
diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py
@@ -7,9 +7,10 @@
 
 import pytest
 
-from pandas._config import using_string_dtype
-
-from pandas import DataFrame
+from pandas import (
+    DataFrame,
+    Index,
+)
 import pandas._testing as tm
 
 xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
@@ -120,7 +121,6 @@ def test_thorough_mangle_names(all_parsers, data, names, expected):
         parser.read_csv(StringIO(data), names=names)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @xfail_pyarrow  # AssertionError: DataFrame.columns are different
 def test_mangled_unnamed_placeholders(all_parsers):
     # xref gh-13017
@@ -132,7 +132,7 @@ def test_mangled_unnamed_placeholders(all_parsers):
 
     # This test recursively updates `df`.
     for i in range(3):
-        expected = DataFrame()
+        expected = DataFrame(columns=Index([], dtype="str"))
 
         for j in range(i + 1):
             col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1)
diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py
@@ -7,8 +7,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas._libs.parsers import STR_NA_VALUES
 
 from pandas import (
@@ -260,7 +258,6 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize(
     "kwargs,expected",
     [
@@ -306,7 +303,9 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected):
         ),
     ],
 )
-def test_na_values_keep_default(all_parsers, kwargs, expected, request):
+def test_na_values_keep_default(
+    all_parsers, kwargs, expected, request, using_infer_string
+):
     data = """\
 A,B,C
 a,1,one
@@ -324,8 +323,9 @@ def test_na_values_keep_default(all_parsers, kwargs, expected, request):
             with pytest.raises(ValueError, match=msg):
                 parser.read_csv(StringIO(data), **kwargs)
             return
-        mark = pytest.mark.xfail()
-        request.applymarker(mark)
+        if not using_infer_string or "na_values" in kwargs:
+            mark = pytest.mark.xfail()
+            request.applymarker(mark)
 
     result = parser.read_csv(StringIO(data), **kwargs)
     tm.assert_frame_equal(result, expected)
@@ -435,23 +435,28 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
-@xfail_pyarrow  # mismatched dtypes in both cases, FutureWarning in the True case
 @pytest.mark.parametrize(
     "na_filter,row_data",
     [
         (True, [[1, "A"], [np.nan, np.nan], [3, "C"]]),
         (False, [["1", "A"], ["nan", "B"], ["3", "C"]]),
     ],
 )
-def test_na_values_na_filter_override(all_parsers, na_filter, row_data):
+def test_na_values_na_filter_override(
+    request, all_parsers, na_filter, row_data, using_infer_string
+):
+    parser = all_parsers
+    if parser.engine == "pyarrow":
+        # mismatched dtypes in both cases, FutureWarning in the True case
+        if not (using_infer_string and na_filter):
+            mark = pytest.mark.xfail(reason="pyarrow doesn't support this.")
+            request.applymarker(mark)
     data = """\
 A,B
 1,A
 nan,B
 3,C
 """
-    parser = all_parsers
     result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter)
 
     expected = DataFrame(row_data, columns=["A", "B"])
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
@@ -16,8 +16,6 @@
 import pytest
 import pytz
 
-from pandas._config import using_string_dtype
-
 from pandas._libs.tslibs import parsing
 
 import pandas as pd
@@ -1799,15 +1797,14 @@ def test_parse_timezone(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @skip_pyarrow  # pandas.errors.ParserError: CSV parse error
 @pytest.mark.parametrize(
     "date_string",
     ["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"],
 )
 def test_invalid_parse_delimited_date(all_parsers, date_string):
     parser = all_parsers
-    expected = DataFrame({0: [date_string]}, dtype="object")
+    expected = DataFrame({0: [date_string]}, dtype="str")
     result = parser.read_csv(
         StringIO(date_string),
         header=None,
@@ -2054,7 +2051,6 @@ def test_parse_dates_and_keep_original_column(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_dayfirst_warnings():
     # GH 12585
 
@@ -2087,7 +2083,7 @@ def test_dayfirst_warnings():
 
     # first in DD/MM/YYYY, second in MM/DD/YYYY
     input = "date\n31/12/2014\n03/30/2011"
-    expected = Index(["31/12/2014", "03/30/2011"], dtype="object", name="date")
+    expected = Index(["31/12/2014", "03/30/2011"], dtype="str", name="date")
 
     # A. use dayfirst=True
     res5 = read_csv(
@@ -2204,7 +2200,6 @@ def test_parse_dates_and_string_dtype(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 def test_parse_dot_separated_dates(all_parsers):
     # https://github.com/pandas-dev/pandas/issues/2586
     parser = all_parsers
@@ -2214,7 +2209,7 @@ def test_parse_dot_separated_dates(all_parsers):
     if parser.engine == "pyarrow":
         expected_index = Index(
             ["27.03.2003 14:55:00.000", "03.08.2003 15:20:00.000"],
-            dtype="object",
+            dtype="str",
             name="a",
         )
         warn = None
diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas._libs.parsers import (
     _maybe_upcast,
     na_values,
@@ -86,7 +84,6 @@ def test_maybe_upcaste_all_nan():
     tm.assert_extension_array_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize("val", [na_values[np.object_], "c"])
 def test_maybe_upcast_object(val, string_storage):
     # GH#36712