Skip to content

Commit 284e359

Browse files
jorisvandenbossche authored and WillAyd committed
Backport PR pandas-dev#60321: TST (string dtype): resolve all xfails in IO parser tests
(cherry picked from commit ee3c18f)
1 parent aa8adfa commit 284e359

11 files changed

+47
-59
lines changed

pandas/tests/io/parser/common/test_chunksize.py

+7-6
Original file line number | Diff line number | Diff line change
@@ -7,8 +7,6 @@
77
import numpy as np
88
import pytest
99

10-
from pandas._config import using_string_dtype
11-
1210
from pandas._libs import parsers as libparsers
1311
from pandas.errors import DtypeWarning
1412

@@ -230,8 +228,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch):
230228
assert result.a.dtype == float
231229

232230

233-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
234-
def test_warn_if_chunks_have_mismatched_type(all_parsers):
231+
def test_warn_if_chunks_have_mismatched_type(all_parsers, using_infer_string):
235232
warning_type = None
236233
parser = all_parsers
237234
size = 10000
@@ -259,8 +256,12 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers):
259256
"Specify dtype option on import or set low_memory=False.",
260257
buf,
261258
)
262-
263-
assert df.a.dtype == object
259+
if parser.engine == "c" and parser.low_memory:
260+
assert df.a.dtype == object
261+
elif using_infer_string:
262+
assert df.a.dtype == "str"
263+
else:
264+
assert df.a.dtype == object
264265

265266

266267
@pytest.mark.parametrize("iterator", [True, False])

pandas/tests/io/parser/common/test_file_buffer_url.py

+2-5
Original file line number | Diff line number | Diff line change
@@ -14,8 +14,6 @@
1414
import numpy as np
1515
import pytest
1616

17-
from pandas._config import using_string_dtype
18-
1917
from pandas.errors import (
2018
EmptyDataError,
2119
ParserError,
@@ -69,14 +67,13 @@ def test_local_file(all_parsers, csv_dir_path):
6967
pytest.skip("Failing on: " + " ".join(platform.uname()))
7068

7169

72-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
7370
@xfail_pyarrow # AssertionError: DataFrame.index are different
7471
def test_path_path_lib(all_parsers):
7572
parser = all_parsers
7673
df = DataFrame(
7774
1.1 * np.arange(120).reshape((30, 4)),
78-
columns=Index(list("ABCD"), dtype=object),
79-
index=Index([f"i-{i}" for i in range(30)], dtype=object),
75+
columns=Index(list("ABCD")),
76+
index=Index([f"i-{i}" for i in range(30)]),
8077
)
8178
result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0))
8279
tm.assert_frame_equal(df, result)

pandas/tests/io/parser/common/test_index.py

+6-4
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,6 @@
88

99
import pytest
1010

11-
from pandas._config import using_string_dtype
12-
1311
from pandas import (
1412
DataFrame,
1513
Index,
@@ -87,9 +85,13 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected):
8785
tm.assert_frame_equal(result, expected)
8886

8987

90-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
9188
@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
92-
def test_multi_index_no_level_names(all_parsers, index_col):
89+
def test_multi_index_no_level_names(
90+
request, all_parsers, index_col, using_infer_string
91+
):
92+
if using_infer_string and all_parsers.engine == "pyarrow":
93+
# result should have string columns instead of object dtype
94+
request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
9395
data = """index1,index2,A,B,C,D
9496
foo,one,2,3,4,5
9597
foo,two,7,8,9,10

pandas/tests/io/parser/dtypes/test_dtypes_basic.py

-4
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,6 @@
88
import numpy as np
99
import pytest
1010

11-
from pandas._config import using_string_dtype
12-
1311
from pandas.errors import ParserWarning
1412

1513
import pandas as pd
@@ -54,7 +52,6 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string):
5452
tm.assert_frame_equal(result, expected)
5553

5654

57-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
5855
@pytest.mark.usefixtures("pyarrow_xfail")
5956
def test_dtype_per_column(all_parsers):
6057
parser = all_parsers
@@ -68,7 +65,6 @@ def test_dtype_per_column(all_parsers):
6865
[[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"]
6966
)
7067
expected["one"] = expected["one"].astype(np.float64)
71-
expected["two"] = expected["two"].astype(object)
7268

7369
result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str})
7470
tm.assert_frame_equal(result, expected)

pandas/tests/io/parser/test_c_parser_only.py

+7-6
Original file line number | Diff line number | Diff line change
@@ -17,8 +17,6 @@
1717
import numpy as np
1818
import pytest
1919

20-
from pandas._config import using_string_dtype
21-
2220
from pandas.compat.numpy import np_version_gte1p24
2321
from pandas.errors import (
2422
ParserError,
@@ -185,8 +183,7 @@ def error(val: float, actual_val: Decimal) -> Decimal:
185183
assert max(precise_errors) <= max(normal_errors)
186184

187185

188-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
189-
def test_usecols_dtypes(c_parser_only):
186+
def test_usecols_dtypes(c_parser_only, using_infer_string):
190187
parser = c_parser_only
191188
data = """\
192189
1,2,3
@@ -211,8 +208,12 @@ def test_usecols_dtypes(c_parser_only):
211208
dtype={"b": int, "c": float},
212209
)
213210

214-
assert (result.dtypes == [object, int, float]).all()
215-
assert (result2.dtypes == [object, float]).all()
211+
if using_infer_string:
212+
assert (result.dtypes == ["string", int, float]).all()
213+
assert (result2.dtypes == ["string", float]).all()
214+
else:
215+
assert (result.dtypes == [object, int, float]).all()
216+
assert (result2.dtypes == [object, float]).all()
216217

217218

218219
def test_disable_bool_parsing(c_parser_only):

pandas/tests/io/parser/test_converters.py

+1-4
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,6 @@
88
import numpy as np
99
import pytest
1010

11-
from pandas._config import using_string_dtype
12-
1311
import pandas as pd
1412
from pandas import (
1513
DataFrame,
@@ -186,7 +184,6 @@ def convert_score(x):
186184
tm.assert_frame_equal(results[0], results[1])
187185

188186

189-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
190187
@pytest.mark.parametrize("conv_f", [lambda x: x, str])
191188
def test_converter_index_col_bug(all_parsers, conv_f):
192189
# see gh-1835 , GH#40589
@@ -205,7 +202,7 @@ def test_converter_index_col_bug(all_parsers, conv_f):
205202
StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
206203
)
207204

208-
xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A", dtype="object"))
205+
xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A"))
209206
tm.assert_frame_equal(rs, xp)
210207

211208

pandas/tests/io/parser/test_index_col.py

+1-4
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,6 @@
88
import numpy as np
99
import pytest
1010

11-
from pandas._config import using_string_dtype
12-
1311
from pandas import (
1412
DataFrame,
1513
Index,
@@ -344,7 +342,6 @@ def test_infer_types_boolean_sum(all_parsers):
344342
tm.assert_frame_equal(result, expected, check_index_type=False)
345343

346344

347-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
348345
@pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)])
349346
def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
350347
# GH#9435
@@ -355,7 +352,7 @@ def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
355352
pytest.mark.xfail(reason="Cannot disable type-inference for pyarrow engine")
356353
)
357354
result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype})
358-
expected = DataFrame({"b": [2]}, index=Index([val], name="a"))
355+
expected = DataFrame({"b": [2]}, index=Index([val], name="a", dtype=dtype))
359356
tm.assert_frame_equal(result, expected)
360357

361358

pandas/tests/io/parser/test_mangle_dupes.py

+5-5
Original file line number | Diff line number | Diff line change
@@ -7,9 +7,10 @@
77

88
import pytest
99

10-
from pandas._config import using_string_dtype
11-
12-
from pandas import DataFrame
10+
from pandas import (
11+
DataFrame,
12+
Index,
13+
)
1314
import pandas._testing as tm
1415

1516
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
@@ -120,7 +121,6 @@ def test_thorough_mangle_names(all_parsers, data, names, expected):
120121
parser.read_csv(StringIO(data), names=names)
121122

122123

123-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
124124
@xfail_pyarrow # AssertionError: DataFrame.columns are different
125125
def test_mangled_unnamed_placeholders(all_parsers):
126126
# xref gh-13017
@@ -132,7 +132,7 @@ def test_mangled_unnamed_placeholders(all_parsers):
132132

133133
# This test recursively updates `df`.
134134
for i in range(3):
135-
expected = DataFrame()
135+
expected = DataFrame(columns=Index([], dtype="str"))
136136

137137
for j in range(i + 1):
138138
col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1)

pandas/tests/io/parser/test_na_values.py

+15-10
Original file line number | Diff line number | Diff line change
@@ -7,8 +7,6 @@
77
import numpy as np
88
import pytest
99

10-
from pandas._config import using_string_dtype
11-
1210
from pandas._libs.parsers import STR_NA_VALUES
1311

1412
from pandas import (
@@ -260,7 +258,6 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected):
260258
tm.assert_frame_equal(result, expected)
261259

262260

263-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
264261
@pytest.mark.parametrize(
265262
"kwargs,expected",
266263
[
@@ -306,7 +303,9 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected):
306303
),
307304
],
308305
)
309-
def test_na_values_keep_default(all_parsers, kwargs, expected, request):
306+
def test_na_values_keep_default(
307+
all_parsers, kwargs, expected, request, using_infer_string
308+
):
310309
data = """\
311310
A,B,C
312311
a,1,one
@@ -324,8 +323,9 @@ def test_na_values_keep_default(all_parsers, kwargs, expected, request):
324323
with pytest.raises(ValueError, match=msg):
325324
parser.read_csv(StringIO(data), **kwargs)
326325
return
327-
mark = pytest.mark.xfail()
328-
request.applymarker(mark)
326+
if not using_infer_string or "na_values" in kwargs:
327+
mark = pytest.mark.xfail()
328+
request.applymarker(mark)
329329

330330
result = parser.read_csv(StringIO(data), **kwargs)
331331
tm.assert_frame_equal(result, expected)
@@ -435,23 +435,28 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v
435435
tm.assert_frame_equal(result, expected)
436436

437437

438-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
439-
@xfail_pyarrow # mismatched dtypes in both cases, FutureWarning in the True case
440438
@pytest.mark.parametrize(
441439
"na_filter,row_data",
442440
[
443441
(True, [[1, "A"], [np.nan, np.nan], [3, "C"]]),
444442
(False, [["1", "A"], ["nan", "B"], ["3", "C"]]),
445443
],
446444
)
447-
def test_na_values_na_filter_override(all_parsers, na_filter, row_data):
445+
def test_na_values_na_filter_override(
446+
request, all_parsers, na_filter, row_data, using_infer_string
447+
):
448+
parser = all_parsers
449+
if parser.engine == "pyarrow":
450+
# mismatched dtypes in both cases, FutureWarning in the True case
451+
if not (using_infer_string and na_filter):
452+
mark = pytest.mark.xfail(reason="pyarrow doesn't support this.")
453+
request.applymarker(mark)
448454
data = """\
449455
A,B
450456
1,A
451457
nan,B
452458
3,C
453459
"""
454-
parser = all_parsers
455460
result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter)
456461

457462
expected = DataFrame(row_data, columns=["A", "B"])

pandas/tests/io/parser/test_parse_dates.py

+3-8
Original file line number | Diff line number | Diff line change
@@ -16,8 +16,6 @@
1616
import pytest
1717
import pytz
1818

19-
from pandas._config import using_string_dtype
20-
2119
from pandas._libs.tslibs import parsing
2220

2321
import pandas as pd
@@ -1799,15 +1797,14 @@ def test_parse_timezone(all_parsers):
17991797
tm.assert_frame_equal(result, expected)
18001798

18011799

1802-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
18031800
@skip_pyarrow # pandas.errors.ParserError: CSV parse error
18041801
@pytest.mark.parametrize(
18051802
"date_string",
18061803
["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"],
18071804
)
18081805
def test_invalid_parse_delimited_date(all_parsers, date_string):
18091806
parser = all_parsers
1810-
expected = DataFrame({0: [date_string]}, dtype="object")
1807+
expected = DataFrame({0: [date_string]}, dtype="str")
18111808
result = parser.read_csv(
18121809
StringIO(date_string),
18131810
header=None,
@@ -2054,7 +2051,6 @@ def test_parse_dates_and_keep_original_column(all_parsers):
20542051
tm.assert_frame_equal(result, expected)
20552052

20562053

2057-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
20582054
def test_dayfirst_warnings():
20592055
# GH 12585
20602056

@@ -2087,7 +2083,7 @@ def test_dayfirst_warnings():
20872083

20882084
# first in DD/MM/YYYY, second in MM/DD/YYYY
20892085
input = "date\n31/12/2014\n03/30/2011"
2090-
expected = Index(["31/12/2014", "03/30/2011"], dtype="object", name="date")
2086+
expected = Index(["31/12/2014", "03/30/2011"], dtype="str", name="date")
20912087

20922088
# A. use dayfirst=True
20932089
res5 = read_csv(
@@ -2204,7 +2200,6 @@ def test_parse_dates_and_string_dtype(all_parsers):
22042200
tm.assert_frame_equal(result, expected)
22052201

22062202

2207-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
22082203
def test_parse_dot_separated_dates(all_parsers):
22092204
# https://github.com/pandas-dev/pandas/issues/2586
22102205
parser = all_parsers
@@ -2214,7 +2209,7 @@ def test_parse_dot_separated_dates(all_parsers):
22142209
if parser.engine == "pyarrow":
22152210
expected_index = Index(
22162211
["27.03.2003 14:55:00.000", "03.08.2003 15:20:00.000"],
2217-
dtype="object",
2212+
dtype="str",
22182213
name="a",
22192214
)
22202215
warn = None

pandas/tests/io/parser/test_upcast.py

-3
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,6 @@
11
import numpy as np
22
import pytest
33

4-
from pandas._config import using_string_dtype
5-
64
from pandas._libs.parsers import (
75
_maybe_upcast,
86
na_values,
@@ -86,7 +84,6 @@ def test_maybe_upcaste_all_nan():
8684
tm.assert_extension_array_equal(result, expected)
8785

8886

89-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
9087
@pytest.mark.parametrize("val", [na_values[np.object_], "c"])
9188
def test_maybe_upcast_object(val, string_storage):
9289
# GH#36712

0 commit comments

Comments
 (0)