Fixes unknown_value=np.nan in OrdinalEncoder (#1198)

xadupre · web-flow · commit 46c85e5e7e47 · 2025-07-24T17:47:42.000+02:00
* update changelogs

Signed-off-by: xadupre <xadupre@microsoft.com>

* Fix unknown_value=np.nan in OrdinalEncoder

Signed-off-by: xadupre <xadupre@microsoft.com>

* changelogs

Signed-off-by: xadupre <xadupre@microsoft.com>

---------

Signed-off-by: xadupre <xadupre@microsoft.com>
diff --git a/CHANGELOGS.md b/CHANGELOGS.md
@@ -1,5 +1,12 @@
 # Change Logs
 
+## 1.20.0
+
+* Fixes unknown_value=np.nan in OrdinalEncoder
+  [#1198](https://github.com/onnx/sklearn-onnx/issues/1198)
+* Enhance OrdinalEncoder conversion to handle infrequent categories
+  [#1195](https://github.com/onnx/sklearn-onnx/issues/1195)
+
 ## 1.19.1
 
 * Fix QDA converter crashing on string labels and incorrect shape calculation
diff --git a/skl2onnx/operator_converters/ordinal_encoder.py b/skl2onnx/operator_converters/ordinal_encoder.py
@@ -26,8 +26,20 @@ def convert_sklearn_ordinal_encoder(
     dimension_idx = 0
 
     # handle the 'handle_unknown=use_encoded_value' case
+    use_float = (
+        False
+        if ordinal_op.unknown_value is None
+        else isinstance(ordinal_op.unknown_value, float)
+        or np.isnan(ordinal_op.unknown_value)
+    )
     default_value = (
-        None if ordinal_op.handle_unknown == "error" else int(ordinal_op.unknown_value)
+        None
+        if ordinal_op.handle_unknown == "error"
+        else (
+            float(ordinal_op.unknown_value)
+            if use_float
+            else int(ordinal_op.unknown_value)
+        )
     )
 
     for categories in ordinal_op.categories_:
@@ -113,43 +125,45 @@ def convert_sklearn_ordinal_encoder(
             )
 
         # hanlde encoded_missing_value
+        key = "values_floats" if use_float else "values_int64s"
+        dtype = np.float32 if use_float else np.int64
         if not np.isnan(ordinal_op.encoded_missing_value) and (
             isinstance(categories[-1], float) and np.isnan(categories[-1])
         ):
             # sklearn always places np.nan as the last entry
-            # in its cathegories if it was in the training data
+            # in its categories if it was in the training data
             # => we simply add the 'ordinal_op.encoded_missing_value'
             # as our last entry in 'values_int64s' if it was in the training data
             encoded_missing_value = np.array(
                 [int(ordinal_op.encoded_missing_value)]
-            ).astype(np.int64)
+            ).astype(dtype)
 
             # handle max_categories or min_frequency
             if default_to_infrequent_mappings is not None:
-                attrs["values_int64s"] = np.concatenate(
+                attrs[key] = np.concatenate(
                     (
-                        np.array(default_to_infrequent_mappings, dtype=np.int64),
+                        np.array(default_to_infrequent_mappings, dtype=dtype),
                         encoded_missing_value,
                     )
                 )
             else:
-                attrs["values_int64s"] = np.concatenate(
+                attrs[key] = np.concatenate(
                     (
-                        np.arange(len(categories) - 1).astype(np.int64),
+                        np.arange(len(categories) - 1).astype(dtype),
                         encoded_missing_value,
                     )
                 )
         else:
             # handle max_categories or min_frequency
             if default_to_infrequent_mappings is not None:
-                attrs["values_int64s"] = np.array(
-                    default_to_infrequent_mappings, dtype=np.int64
-                )
+                attrs[key] = np.array(default_to_infrequent_mappings, dtype=dtype)
             else:
-                attrs["values_int64s"] = np.arange(len(categories)).astype(np.int64)
+                attrs[key] = np.arange(len(categories)).astype(dtype)
 
-        if default_value:
-            attrs["default_int64"] = default_value
+        if default_value or (
+            isinstance(default_value, float) and np.isnan(default_value)
+        ):
+            attrs["default_float" if use_float else "default_int64"] = default_value
 
         result.append(scope.get_unique_variable_name("ordinal_output"))
         label_encoder_output = scope.get_unique_variable_name("label_encoder")
diff --git a/tests/test_sklearn_ordinal_encoder.py b/tests/test_sklearn_ordinal_encoder.py
@@ -464,6 +464,46 @@ def test_model_ordinal_encoder_min_frequency(self):
 
         assert_almost_equal(expected.reshape(-1), got[0].reshape(-1))
 
+    @unittest.skipIf(
+        not ordinal_encoder_support(),
+        reason="OrdinalEncoder was not available before 0.20",
+    )
+    def test_model_ordinal_encoder_unknown_value_nan(self):
+        from onnxruntime import InferenceSession
+
+        model = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)
+        data = np.array([["a"], ["b"], ["c"], ["d"]], dtype=np.object_)
+        data_with_missing_value = np.array(
+            [["a"], ["b"], ["c"], ["d"], [np.nan], ["e"], [None]], dtype=np.object_
+        )
+
+        model.fit(data)
+        # np.nan, 'e' and None were not seen during fit => encoded as np.nan.
+        expected = model.transform(data_with_missing_value)
+
+        model_onnx = convert_sklearn(
+            model,
+            "scikit-learn ordinal encoder",
+            [("input", StringTensorType([None, 1]))],
+            target_opset=TARGET_OPSET,
+        )
+        self.assertIsNotNone(model_onnx)
+        dump_data_and_model(
+            data, model, model_onnx, basename="SklearnOrdinalEncoderUnknownValue"
+        )
+
+        sess = InferenceSession(
+            model_onnx.SerializeToString(), providers=["CPUExecutionProvider"]
+        )
+        got = sess.run(
+            None,
+            {
+                "input": data_with_missing_value,
+            },
+        )
+
+        assert_almost_equal(expected.reshape(-1), got[0].reshape(-1))
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)