Skip to content

Commit

Permalink
BUG: Missing value code not recognised for Stata format version 105 and earlier (pandas-dev#59325)
Browse files Browse the repository at this point in the history

* BUG: Missing value code not recognised for Stata format version 105 and earlier

* Move definition of the old missing value constant for the double type out of the loop
  • Loading branch information
cmjcharlton authored Jul 26, 2024
1 parent 0e0814b commit 5af55e0
Show file tree
Hide file tree
Showing 11 changed files with 38 additions and 11 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -584,6 +584,7 @@ I/O
- Bug in :meth:`read_json` not validating that the ``typ`` argument is exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data (:issue:`58638`)
- Bug in :meth:`read_stata` where extreme value integers were incorrectly interpreted as missing for format versions 111 and prior (:issue:`58130`)
- Bug in :meth:`read_stata` where the missing value code for the double type was not recognised for format versions 105 and prior (:issue:`58149`)

Period
^^^^^^
Expand Down
9 changes: 9 additions & 0 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -1817,10 +1817,19 @@ def read(
return data

def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFrame:
# missing code for double was different in version 105 and prior
old_missingdouble = float.fromhex("0x1.0p333")

# Check for missing values, and replace if found
replacements = {}
for i in range(len(data.columns)):
fmt = self._typlist[i]
# recode instances of the old missing code to the currently used value
if self._format_version <= 105 and fmt == "d":
data.iloc[:, i] = data.iloc[:, i].replace(
old_missingdouble, self.MISSING_VALUES["d"]
)

if self._format_version <= 111:
if fmt not in self.OLD_VALID_RANGE:
continue
Expand Down
Binary file added pandas/tests/io/data/stata/stata1_102.dta
Binary file not shown.
Binary file added pandas/tests/io/data/stata/stata1_103.dta
Binary file not shown.
Binary file added pandas/tests/io/data/stata/stata1_104.dta
Binary file not shown.
Binary file added pandas/tests/io/data/stata/stata1_105.dta
Binary file not shown.
Binary file added pandas/tests/io/data/stata/stata8_102.dta
Binary file not shown.
Binary file added pandas/tests/io/data/stata/stata8_103.dta
Binary file not shown.
Binary file added pandas/tests/io/data/stata/stata8_104.dta
Binary file not shown.
Binary file added pandas/tests/io/data/stata/stata8_105.dta
Binary file not shown.
39 changes: 28 additions & 11 deletions pandas/tests/io/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,9 +120,9 @@ def test_read_index_col_none(self, version, temp_file):
expected["a"] = expected["a"].astype(np.int32)
tm.assert_frame_equal(read_df, expected, check_index_type=True)

# Note this test starts at format version 108 as the missing code for double
# was different prior to this (see GH 58149) and would therefore fail
@pytest.mark.parametrize("version", [108, 110, 111, 113, 114, 115, 117, 118, 119])
@pytest.mark.parametrize(
"version", [102, 103, 104, 105, 108, 110, 111, 113, 114, 115, 117, 118, 119]
)
def test_read_dta1(self, version, datapath):
file = datapath("io", "data", "stata", f"stata1_{version}.dta")
parsed = self.read_dta(file)
Expand Down Expand Up @@ -918,8 +918,8 @@ def test_missing_value_generator(self, temp_file):
)
assert val.string == ".z"

@pytest.mark.parametrize("file", ["stata8_113", "stata8_115", "stata8_117"])
def test_missing_value_conversion(self, file, datapath):
@pytest.mark.parametrize("version", [113, 115, 117])
def test_missing_value_conversion(self, version, datapath):
columns = ["int8_", "int16_", "int32_", "float32_", "float64_"]
smv = StataMissingValue(101)
keys = sorted(smv.MISSING_VALUES.keys())
Expand All @@ -930,14 +930,13 @@ def test_missing_value_conversion(self, file, datapath):
expected = DataFrame(data, columns=columns)

parsed = read_stata(
datapath("io", "data", "stata", f"{file}.dta"), convert_missing=True
datapath("io", "data", "stata", f"stata8_{version}.dta"),
convert_missing=True,
)
tm.assert_frame_equal(parsed, expected)

# Note this test starts at format version 108 as the missing code for double
# was different prior to this (see GH 58149) and would therefore fail
@pytest.mark.parametrize("file", ["stata8_108", "stata8_110", "stata8_111"])
def test_missing_value_conversion_compat(self, file, datapath):
@pytest.mark.parametrize("version", [104, 105, 108, 110, 111])
def test_missing_value_conversion_compat(self, version, datapath):
columns = ["int8_", "int16_", "int32_", "float32_", "float64_"]
smv = StataMissingValue(101)
keys = sorted(smv.MISSING_VALUES.keys())
Expand All @@ -947,7 +946,25 @@ def test_missing_value_conversion_compat(self, file, datapath):
expected = DataFrame(data, columns=columns)

parsed = read_stata(
datapath("io", "data", "stata", f"{file}.dta"), convert_missing=True
datapath("io", "data", "stata", f"stata8_{version}.dta"),
convert_missing=True,
)
tm.assert_frame_equal(parsed, expected)

# The byte type was not supported prior to the 104 format
@pytest.mark.parametrize("version", [102, 103])
def test_missing_value_conversion_compat_nobyte(self, version, datapath):
    """Read ``stata8_{version}.dta`` with ``convert_missing=True`` and compare.

    The expected frame is a single row of ``StataMissingValue`` objects, one
    per column, built from every 27th entry of the sorted ``MISSING_VALUES``
    keys. The ``int8_`` column deliberately uses the same offset as
    ``int16_`` (offset 1 appears twice), since these formats have no byte type.
    """
    # NOTE(review): the stride of 27 presumably steps from one storage type's
    # first missing code to the next - confirm against StataMissingValue.
    sorted_codes = sorted(StataMissingValue(101).MISSING_VALUES.keys())
    offsets = (1, 1, 2, 3, 4)
    expected = DataFrame(
        [[StataMissingValue(sorted_codes[k * 27]) for k in offsets]],
        columns=["int8_", "int16_", "int32_", "float32_", "float64_"],
    )

    dta_path = datapath("io", "data", "stata", f"stata8_{version}.dta")
    parsed = read_stata(dta_path, convert_missing=True)
    tm.assert_frame_equal(parsed, expected)

Expand Down

0 comments on commit 5af55e0

Please sign in to comment.