@@ -739,6 +739,16 @@ def fit(self,
739
739
All other fields are filled as usual for `Keras.Model.fit()`.
740
740
"""
741
741
742
+ # If the dataset was created with "pd_dataframe_to_tf_dataset", ensure that
743
+ # the task is correctly set.
744
+ if hasattr (x , "_tfdf_task" ):
745
+ dataset_task = getattr (x , "_tfdf_task" )
746
+ if dataset_task != self ._task :
747
+ raise ValueError (
748
+ f"The model's `task` attribute ({ Task .Name (self ._task )} ) does "
749
+ "not match the `task` attribute passed to "
750
+ f"`pd_dataframe_to_tf_dataset` ({ Task .Name (dataset_task )} )." )
751
+
742
752
# Call "compile" if the user forgot to do so.
743
753
if not self ._is_compiled :
744
754
self .compile ()
@@ -1005,7 +1015,8 @@ def _batch_size(inputs: Union[tf.Tensor, Dict[str, tf.Tensor]]) -> tf.Tensor:
1005
1015
def pd_dataframe_to_tf_dataset (
1006
1016
dataframe ,
1007
1017
label : Optional [str ] = None ,
1008
- task : Optional [TaskType ] = Task .CLASSIFICATION ) -> tf .data .Dataset :
1018
+ task : Optional [TaskType ] = Task .CLASSIFICATION ,
1019
+ max_num_classes : Optional [int ] = 100 ) -> tf .data .Dataset :
1009
1020
"""Converts a Pandas Dataframe into a TF Dataset.
1010
1021
1011
1022
Details:
@@ -1025,6 +1036,10 @@ def pd_dataframe_to_tf_dataset(
1025
1036
dataframe: Pandas dataframe containing a training or evaluation dataset.
1026
1037
label: Name of the label column.
1027
1038
task: Target task of the dataset.
1039
+ max_num_classes: Maximum number of classes for a classification task. A high
1040
+ number of unique values / classes might indicate that the problem is a
1041
+ regression or a ranking instead of a classification. Set to None to
1042
+ disable checking the number of classes.
1028
1043
1029
1044
Returns:
1030
1045
A TensorFlow Dataset.
@@ -1035,6 +1050,14 @@ def pd_dataframe_to_tf_dataset(
1035
1050
if task == Task .CLASSIFICATION and label is not None :
1036
1051
classification_classes = dataframe [label ].unique ().tolist ()
1037
1052
classification_classes .sort ()
1053
+ if len (classification_classes ) > max_num_classes :
1054
+ raise ValueError (
1055
+ f"The number of unique classes ({ len (classification_classes )} ) "
1056
+ f"exceeds max_num_classes ({ max_num_classes } ). A high number of "
1057
+ "unique value / classes might indicate that the problem is a "
1058
+ "regression or a ranking instead of a classification. If this "
1059
+ "problem is effectively a classification problem, increase "
1060
+ "`max_num_classes`." )
1038
1061
dataframe [label ] = dataframe [label ].map (classification_classes .index )
1039
1062
1040
1063
# Make sure that missing values for string columns are not represented as
@@ -1050,7 +1073,10 @@ def pd_dataframe_to_tf_dataset(
1050
1073
tf_dataset = tf .data .Dataset .from_tensor_slices (dict (dataframe ))
1051
1074
1052
1075
# The batch size does not impact the training of TF-DF.
1053
- return tf_dataset .batch (64 )
1076
+ tf_dataset = tf_dataset .batch (64 )
1077
+
1078
+ setattr (tf_dataset , "_tfdf_task" , task )
1079
+ return tf_dataset
1054
1080
1055
1081
1056
1082
def yggdrasil_model_to_keras_model (src_path : str , dst_path : str ):
0 commit comments