Skip to content

Commit 46c85e5

Browse files
authored
Fixes unknown_value=np.nan in OrdinalEncoder (#1198)
* update changelogs Signed-off-by: xadupre <[email protected]> * Fix unknown_value=np.nan in OrdinalEncoder Signed-off-by: xadupre <[email protected]> * changelogs Signed-off-by: xadupre <[email protected]> --------- Signed-off-by: xadupre <[email protected]>
1 parent 2462f35 commit 46c85e5

File tree

3 files changed

+74
-13
lines changed

3 files changed

+74
-13
lines changed

CHANGELOGS.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
# Change Logs
22

3+
## 1.20.0
4+
5+
* Fixes unknown_value=np.nan in OrdinalEncoder
6+
[#1198](https://github.com/onnx/sklearn-onnx/issues/1198)
7+
* Enhance OrdinalEncoder conversion to handle infrequent categories
8+
[#1195](https://github.com/onnx/sklearn-onnx/issues/1195)
9+
310
## 1.19.1
411

512
* Fix QDA converter crashing on string labels and incorrect shape calculation

skl2onnx/operator_converters/ordinal_encoder.py

Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,20 @@ def convert_sklearn_ordinal_encoder(
2626
dimension_idx = 0
2727

2828
# handle the 'handle_unknown=use_encoded_value' case
29+
use_float = (
30+
False
31+
if ordinal_op.unknown_value is None
32+
else isinstance(ordinal_op.unknown_value, float)
33+
or np.isnan(ordinal_op.unknown_value)
34+
)
2935
default_value = (
30-
None if ordinal_op.handle_unknown == "error" else int(ordinal_op.unknown_value)
36+
None
37+
if ordinal_op.handle_unknown == "error"
38+
else (
39+
float(ordinal_op.unknown_value)
40+
if use_float
41+
else int(ordinal_op.unknown_value)
42+
)
3143
)
3244

3345
for categories in ordinal_op.categories_:
@@ -113,43 +125,45 @@ def convert_sklearn_ordinal_encoder(
113125
)
114126

115127
# handle encoded_missing_value
128+
key = "values_floats" if use_float else "values_int64s"
129+
dtype = np.float32 if use_float else np.int64
116130
if not np.isnan(ordinal_op.encoded_missing_value) and (
117131
isinstance(categories[-1], float) and np.isnan(categories[-1])
118132
):
119133
# sklearn always places np.nan as the last entry
120-
# in its cathegories if it was in the training data
134+
# in its categories if it was in the training data
121135
# => we simply add the 'ordinal_op.encoded_missing_value'
122136
# as our last entry in the values attribute ('values_int64s' or 'values_floats') if it was in the training data
123137
encoded_missing_value = np.array(
124138
[int(ordinal_op.encoded_missing_value)]
125-
).astype(np.int64)
139+
).astype(dtype)
126140

127141
# handle max_categories or min_frequency
128142
if default_to_infrequent_mappings is not None:
129-
attrs["values_int64s"] = np.concatenate(
143+
attrs[key] = np.concatenate(
130144
(
131-
np.array(default_to_infrequent_mappings, dtype=np.int64),
145+
np.array(default_to_infrequent_mappings, dtype=dtype),
132146
encoded_missing_value,
133147
)
134148
)
135149
else:
136-
attrs["values_int64s"] = np.concatenate(
150+
attrs[key] = np.concatenate(
137151
(
138-
np.arange(len(categories) - 1).astype(np.int64),
152+
np.arange(len(categories) - 1).astype(dtype),
139153
encoded_missing_value,
140154
)
141155
)
142156
else:
143157
# handle max_categories or min_frequency
144158
if default_to_infrequent_mappings is not None:
145-
attrs["values_int64s"] = np.array(
146-
default_to_infrequent_mappings, dtype=np.int64
147-
)
159+
attrs[key] = np.array(default_to_infrequent_mappings, dtype=dtype)
148160
else:
149-
attrs["values_int64s"] = np.arange(len(categories)).astype(np.int64)
161+
attrs[key] = np.arange(len(categories)).astype(dtype)
150162

151-
if default_value:
152-
attrs["default_int64"] = default_value
163+
if default_value or (
164+
isinstance(default_value, float) and np.isnan(default_value)
165+
):
166+
attrs["default_float" if use_float else "default_int64"] = default_value
153167

154168
result.append(scope.get_unique_variable_name("ordinal_output"))
155169
label_encoder_output = scope.get_unique_variable_name("label_encoder")

tests/test_sklearn_ordinal_encoder.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -464,6 +464,46 @@ def test_model_ordinal_encoder_min_frequency(self):
464464

465465
assert_almost_equal(expected.reshape(-1), got[0].reshape(-1))
466466

467+
@unittest.skipIf(
468+
not ordinal_encoder_support(),
469+
reason="OrdinalEncoder was not available before 0.20",
470+
)
471+
def test_model_ordinal_encoder_unknown_value_nan(self):
472+
from onnxruntime import InferenceSession
473+
474+
model = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)
475+
data = np.array([["a"], ["b"], ["c"], ["d"]], dtype=np.object_)
476+
data_with_missing_value = np.array(
477+
[["a"], ["b"], ["c"], ["d"], [np.nan], ["e"], [None]], dtype=np.object_
478+
)
479+
480+
model.fit(data)
481+
# 'np.nan', 'e' and 'None' were not seen during fit, so they are encoded as unknown_value (np.nan).
482+
expected = model.transform(data_with_missing_value)
483+
484+
model_onnx = convert_sklearn(
485+
model,
486+
"scikit-learn ordinal encoder",
487+
[("input", StringTensorType([None, 1]))],
488+
target_opset=TARGET_OPSET,
489+
)
490+
self.assertIsNotNone(model_onnx)
491+
dump_data_and_model(
492+
data, model, model_onnx, basename="SklearnOrdinalEncoderUnknownValue"
493+
)
494+
495+
sess = InferenceSession(
496+
model_onnx.SerializeToString(), providers=["CPUExecutionProvider"]
497+
)
498+
got = sess.run(
499+
None,
500+
{
501+
"input": data_with_missing_value,
502+
},
503+
)
504+
505+
assert_almost_equal(expected.reshape(-1), got[0].reshape(-1))
506+
467507

468508
if __name__ == "__main__":
469509
unittest.main(verbosity=2)

0 commit comments

Comments
 (0)