BUG: Missing value code not recognised for Stata format version 105 and earlier (pandas-dev#59325)

* BUG: Missing value code not recognised for Stata format version 105 and earlier
* Move definition of the old missing value constant for the double type out of the loop
natmokval · Jul 26, 2024 · commit 5af55e0 · 1 parent 0e0814b
Show file tree

Hide file tree

Showing 11 changed files with 38 additions and 11 deletions.
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -584,6 +584,7 @@ I/O
 - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
 - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
 - Bug in :meth:`read_stata` where extreme value integers were incorrectly interpreted as missing for format versions 111 and prior (:issue:`58130`)
+- Bug in :meth:`read_stata` where the missing code for double was not recognised for format versions 105 and prior (:issue:`58149`)
 
 Period
 ^^^^^^

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -1817,10 +1817,19 @@ def read(
         return data
 
     def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFrame:
+        # missing code for double was different in version 105 and prior
+        old_missingdouble = float.fromhex("0x1.0p333")
+
         # Check for missing values, and replace if found
         replacements = {}
         for i in range(len(data.columns)):
             fmt = self._typlist[i]
+            # recode instances of the old missing code to the currently used value
+            if self._format_version <= 105 and fmt == "d":
+                data.iloc[:, i] = data.iloc[:, i].replace(
+                    old_missingdouble, self.MISSING_VALUES["d"]
+                )
+
             if self._format_version <= 111:
                 if fmt not in self.OLD_VALID_RANGE:
                     continue

diff --git a/pandas/tests/io/data/stata/stata1_102.dta b/pandas/tests/io/data/stata/stata1_102.dta
diff --git a/pandas/tests/io/data/stata/stata1_103.dta b/pandas/tests/io/data/stata/stata1_103.dta
diff --git a/pandas/tests/io/data/stata/stata1_104.dta b/pandas/tests/io/data/stata/stata1_104.dta
diff --git a/pandas/tests/io/data/stata/stata1_105.dta b/pandas/tests/io/data/stata/stata1_105.dta
diff --git a/pandas/tests/io/data/stata/stata8_102.dta b/pandas/tests/io/data/stata/stata8_102.dta
diff --git a/pandas/tests/io/data/stata/stata8_103.dta b/pandas/tests/io/data/stata/stata8_103.dta
diff --git a/pandas/tests/io/data/stata/stata8_104.dta b/pandas/tests/io/data/stata/stata8_104.dta
diff --git a/pandas/tests/io/data/stata/stata8_105.dta b/pandas/tests/io/data/stata/stata8_105.dta
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
@@ -120,9 +120,9 @@ def test_read_index_col_none(self, version, temp_file):
         expected["a"] = expected["a"].astype(np.int32)
         tm.assert_frame_equal(read_df, expected, check_index_type=True)
 
-    # Note this test starts at format version 108 as the missing code for double
-    # was different prior to this (see GH 58149) and would therefore fail
-    @pytest.mark.parametrize("version", [108, 110, 111, 113, 114, 115, 117, 118, 119])
+    @pytest.mark.parametrize(
+        "version", [102, 103, 104, 105, 108, 110, 111, 113, 114, 115, 117, 118, 119]
+    )
     def test_read_dta1(self, version, datapath):
         file = datapath("io", "data", "stata", f"stata1_{version}.dta")
         parsed = self.read_dta(file)
@@ -918,8 +918,8 @@ def test_missing_value_generator(self, temp_file):
         )
         assert val.string == ".z"
 
-    @pytest.mark.parametrize("file", ["stata8_113", "stata8_115", "stata8_117"])
-    def test_missing_value_conversion(self, file, datapath):
+    @pytest.mark.parametrize("version", [113, 115, 117])
+    def test_missing_value_conversion(self, version, datapath):
         columns = ["int8_", "int16_", "int32_", "float32_", "float64_"]
         smv = StataMissingValue(101)
         keys = sorted(smv.MISSING_VALUES.keys())
@@ -930,14 +930,13 @@ def test_missing_value_conversion(self, file, datapath):
         expected = DataFrame(data, columns=columns)
 
         parsed = read_stata(
-            datapath("io", "data", "stata", f"{file}.dta"), convert_missing=True
+            datapath("io", "data", "stata", f"stata8_{version}.dta"),
+            convert_missing=True,
         )
         tm.assert_frame_equal(parsed, expected)
 
-    # Note this test starts at format version 108 as the missing code for double
-    # was different prior to this (see GH 58149) and would therefore fail
-    @pytest.mark.parametrize("file", ["stata8_108", "stata8_110", "stata8_111"])
-    def test_missing_value_conversion_compat(self, file, datapath):
+    @pytest.mark.parametrize("version", [104, 105, 108, 110, 111])
+    def test_missing_value_conversion_compat(self, version, datapath):
         columns = ["int8_", "int16_", "int32_", "float32_", "float64_"]
         smv = StataMissingValue(101)
         keys = sorted(smv.MISSING_VALUES.keys())
@@ -947,7 +946,25 @@ def test_missing_value_conversion_compat(self, file, datapath):
         expected = DataFrame(data, columns=columns)
 
         parsed = read_stata(
-            datapath("io", "data", "stata", f"{file}.dta"), convert_missing=True
+            datapath("io", "data", "stata", f"stata8_{version}.dta"),
+            convert_missing=True,
+        )
+        tm.assert_frame_equal(parsed, expected)
+
+    # The byte type was not supported prior to the 104 format
+    @pytest.mark.parametrize("version", [102, 103])
+    def test_missing_value_conversion_compat_nobyte(self, version, datapath):
+        columns = ["int8_", "int16_", "int32_", "float32_", "float64_"]
+        smv = StataMissingValue(101)
+        keys = sorted(smv.MISSING_VALUES.keys())
+        data = []
+        row = [StataMissingValue(keys[j * 27]) for j in [1, 1, 2, 3, 4]]
+        data.append(row)
+        expected = DataFrame(data, columns=columns)
+
+        parsed = read_stata(
+            datapath("io", "data", "stata", f"stata8_{version}.dta"),
+            convert_missing=True,
         )
         tm.assert_frame_equal(parsed, expected)