-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: read_csv not converting to float for python engine with decimal sep, usecols and parse_dates #38334
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
BUG: read_csv not converting to float for python engine with decimal sep, usecols and parse_dates #38334
Changes from 6 commits
8c2e1ca
c1b9a7b
76b91bf
9de5059
ca76832
85a3d22
2958d2a
1d740e0
88bf395
6ad5385
da4b602
384c114
070f67d
70780da
9d6205a
4afa2c8
5bee24a
d96a256
c6a226b
3ece01b
6cca960
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2294,7 +2294,6 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): | |
|
||
# Get columns in two steps: infer from data, then | ||
# infer column indices from self.usecols if it is specified. | ||
self._col_indices = None | ||
try: | ||
( | ||
self.columns, | ||
|
@@ -2336,6 +2335,9 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): | |
if self.index_names is None: | ||
self.index_names = index_names | ||
|
||
if not hasattr(self, "_col_indices"): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why can't we always define it this way at L2310 or L2297? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We do not have access to self.columns at L2297. Sometimes we set this in [inline code reference lost in extraction]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you instead define _col_list = None as the default, then? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This would work, of course, but we would get the mypy problems back again. I could do that nevertheless if it is preferable. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, I think it's important that this is always defined. You can use [inline code reference lost in extraction; the reply that follows refers to Optional[List[int]]]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Optional[List[int]] unfortunately raises the mypy error too. Asserting that it is not None does not help, so I added the ignores back in. |
||
self._col_indices = list(range(len(self.columns))) | ||
|
||
self._validate_parse_dates_presence(self.columns) | ||
if self.parse_dates: | ||
self._no_thousands_columns = self._set_no_thousands_columns() | ||
|
@@ -2359,7 +2361,7 @@ def _set(x): | |
if is_integer(x): | ||
noconvert_columns.add(x) | ||
else: | ||
noconvert_columns.add(self.columns.index(x)) | ||
noconvert_columns.add(self._col_indices[self.columns.index(x)]) | ||
|
||
if isinstance(self.parse_dates, list): | ||
for val in self.parse_dates: | ||
|
@@ -2709,7 +2711,6 @@ def _infer_columns(self): | |
# overwritten. | ||
self._handle_usecols(columns, names) | ||
else: | ||
self._col_indices = None | ||
num_original_columns = len(names) | ||
columns = [names] | ||
else: | ||
|
@@ -2791,7 +2792,7 @@ def _handle_usecols(self, columns, usecols_key): | |
[n for i, n in enumerate(column) if i in col_indices] | ||
for column in columns | ||
] | ||
self._col_indices = col_indices | ||
self._col_indices = sorted(col_indices) | ||
return columns | ||
|
||
def _buffered_line(self): | ||
|
@@ -3193,8 +3194,7 @@ def _rows_to_cols(self, content): | |
i < len(self.index_col) | ||
# pandas\io\parsers.py:3159: error: Unsupported right | ||
# operand type for in ("Optional[Any]") [operator] | ||
or i - len(self.index_col) # type: ignore[operator] | ||
in self._col_indices | ||
or i - len(self.index_col) in self._col_indices | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Since you no longer ignore the mypy check here, the comments above are irrelevant. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks, that is a good point. |
||
) | ||
] | ||
else: | ||
|
@@ -3203,7 +3203,7 @@ def _rows_to_cols(self, content): | |
# operand type for in ("Optional[Any]") [operator] | ||
a | ||
for i, a in enumerate(zipped_content) | ||
if i in self._col_indices # type: ignore[operator] | ||
if i in self._col_indices | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same here: the comment above about the mypy error is no longer relevant. |
||
] | ||
return zipped_content | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,7 +12,7 @@ | |
|
||
from pandas.errors import ParserError | ||
|
||
from pandas import DataFrame, Index, MultiIndex | ||
from pandas import DataFrame, Index, MultiIndex, Timestamp | ||
import pandas._testing as tm | ||
|
||
|
||
|
@@ -314,3 +314,19 @@ def test_malformed_skipfooter(python_parser_only): | |
msg = "Expected 3 fields in line 4, saw 5" | ||
with pytest.raises(ParserError, match=msg): | ||
parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1) | ||
|
||
|
||
def test_delimiter_with_usecols_and_parse_dates(python_parser_only): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Does this not work with the C parser? If not, why? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It works for the C parser too; I am not quite sure why I tested only the Python engine. Moved it. |
||
# GH#35873 | ||
result = python_parser_only.read_csv( | ||
StringIO('"dump","-9,1","-9,1",20101010'), | ||
engine="python", | ||
names=["col", "col1", "col2", "col3"], | ||
usecols=["col1", "col2", "col3"], | ||
parse_dates=["col3"], | ||
decimal=",", | ||
) | ||
expected = DataFrame( | ||
{"col1": [-9.1], "col2": [-9.1], "col3": [Timestamp("2010-10-10")]} | ||
) | ||
tm.assert_frame_equal(result, expected) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This will need to be removed from 1.2.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hm, weird, I must have missed that. Thanks.