@@ -26,8 +26,20 @@ def convert_sklearn_ordinal_encoder(
26
26
dimension_idx = 0
27
27
28
28
# handle the 'handle_unknown=use_encoded_value' case
29
+ use_float = (
30
+ False
31
+ if ordinal_op .unknown_value is None
32
+ else isinstance (ordinal_op .unknown_value , float )
33
+ or np .isnan (ordinal_op .unknown_value )
34
+ )
29
35
default_value = (
30
- None if ordinal_op .handle_unknown == "error" else int (ordinal_op .unknown_value )
36
+ None
37
+ if ordinal_op .handle_unknown == "error"
38
+ else (
39
+ float (ordinal_op .unknown_value )
40
+ if use_float
41
+ else int (ordinal_op .unknown_value )
42
+ )
31
43
)
32
44
33
45
for categories in ordinal_op .categories_ :
@@ -113,43 +125,45 @@ def convert_sklearn_ordinal_encoder(
113
125
)
114
126
115
127
# handle encoded_missing_value
128
+ key = "values_floats" if use_float else "values_int64s"
129
+ dtype = np .float32 if use_float else np .int64
116
130
if not np .isnan (ordinal_op .encoded_missing_value ) and (
117
131
isinstance (categories [- 1 ], float ) and np .isnan (categories [- 1 ])
118
132
):
119
133
# sklearn always places np.nan as the last entry
120
- # in its cathegories if it was in the training data
134
+ # in its categories if it was in the training data
121
135
# => we simply add the 'ordinal_op.encoded_missing_value'
122
136
# as our last entry in the values attribute ('values_floats' or 'values_int64s') if it was in the training data
123
137
encoded_missing_value = np .array (
124
138
[int (ordinal_op .encoded_missing_value )]
125
- ).astype (np . int64 )
139
+ ).astype (dtype )
126
140
127
141
# handle max_categories or min_frequency
128
142
if default_to_infrequent_mappings is not None :
129
- attrs ["values_int64s" ] = np .concatenate (
143
+ attrs [key ] = np .concatenate (
130
144
(
131
- np .array (default_to_infrequent_mappings , dtype = np . int64 ),
145
+ np .array (default_to_infrequent_mappings , dtype = dtype ),
132
146
encoded_missing_value ,
133
147
)
134
148
)
135
149
else :
136
- attrs ["values_int64s" ] = np .concatenate (
150
+ attrs [key ] = np .concatenate (
137
151
(
138
- np .arange (len (categories ) - 1 ).astype (np . int64 ),
152
+ np .arange (len (categories ) - 1 ).astype (dtype ),
139
153
encoded_missing_value ,
140
154
)
141
155
)
142
156
else :
143
157
# handle max_categories or min_frequency
144
158
if default_to_infrequent_mappings is not None :
145
- attrs ["values_int64s" ] = np .array (
146
- default_to_infrequent_mappings , dtype = np .int64
147
- )
159
+ attrs [key ] = np .array (default_to_infrequent_mappings , dtype = dtype )
148
160
else :
149
- attrs ["values_int64s" ] = np .arange (len (categories )).astype (np . int64 )
161
+ attrs [key ] = np .arange (len (categories )).astype (dtype )
150
162
151
- if default_value :
152
- attrs ["default_int64" ] = default_value
163
+ if default_value or (
164
+ isinstance (default_value , float ) and np .isnan (default_value )
165
+ ):
166
+ attrs ["default_float" if use_float else "default_int64" ] = default_value
153
167
154
168
result .append (scope .get_unique_variable_name ("ordinal_output" ))
155
169
label_encoder_output = scope .get_unique_variable_name ("label_encoder" )
0 commit comments