@@ -739,6 +739,16 @@ def fit(self,
739739 All other fields are filled as usual for `Keras.Model.fit()`.
740740 """
741741
742+ # If the dataset was created with "pd_dataframe_to_tf_dataset", ensure that
743+ # the task is correctly set.
744+ if hasattr (x , "_tfdf_task" ):
745+ dataset_task = getattr (x , "_tfdf_task" )
746+ if dataset_task != self ._task :
747+ raise ValueError (
748+ f"The model's `task` attribute ({ Task .Name (self ._task )} ) does "
749+ "not match the `task` attribute passed to "
750+ f"`pd_dataframe_to_tf_dataset` ({ Task .Name (dataset_task )} )." )
751+
742752 # Call "compile" if the user forgot to do so.
743753 if not self ._is_compiled :
744754 self .compile ()
@@ -1005,7 +1015,8 @@ def _batch_size(inputs: Union[tf.Tensor, Dict[str, tf.Tensor]]) -> tf.Tensor:
10051015def pd_dataframe_to_tf_dataset (
10061016 dataframe ,
10071017 label : Optional [str ] = None ,
1008- task : Optional [TaskType ] = Task .CLASSIFICATION ) -> tf .data .Dataset :
1018+ task : Optional [TaskType ] = Task .CLASSIFICATION ,
1019+ max_num_classes : Optional [int ] = 100 ) -> tf .data .Dataset :
10091020 """Converts a Pandas Dataframe into a TF Dataset.
10101021
10111022 Details:
@@ -1025,6 +1036,10 @@ def pd_dataframe_to_tf_dataset(
10251036 dataframe: Pandas dataframe containing a training or evaluation dataset.
10261037 label: Name of the label column.
10271038 task: Target task of the dataset.
1039+ max_num_classes: Maximum number of classes for a classification task. A high
1040+ number of unique values / classes might indicate that the problem is a
1041+ regression or a ranking instead of a classification. Set to None to
1042+ disable checking the number of classes.
10281043
10291044 Returns:
10301045 A TensorFlow Dataset.
@@ -1035,6 +1050,14 @@ def pd_dataframe_to_tf_dataset(
10351050 if task == Task .CLASSIFICATION and label is not None :
10361051 classification_classes = dataframe [label ].unique ().tolist ()
10371052 classification_classes .sort ()
1053+ if len (classification_classes ) > max_num_classes :
1054+ raise ValueError (
1055+ f"The number of unique classes ({ len (classification_classes )} ) "
1056+ f"exceeds max_num_classes ({ max_num_classes } ). A high number of "
1057+ "unique value / classes might indicate that the problem is a "
1058+ "regression or a ranking instead of a classification. If this "
1059+ "problem is effectively a classification problem, increase "
1060+ "`max_num_classes`." )
10381061 dataframe [label ] = dataframe [label ].map (classification_classes .index )
10391062
10401063 # Make sure that missing values for string columns are not represented as
@@ -1050,7 +1073,10 @@ def pd_dataframe_to_tf_dataset(
10501073 tf_dataset = tf .data .Dataset .from_tensor_slices (dict (dataframe ))
10511074
10521075 # The batch size does not impact the training of TF-DF.
1053- return tf_dataset .batch (64 )
1076+ tf_dataset = tf_dataset .batch (64 )
1077+
1078+ setattr (tf_dataset , "_tfdf_task" , task )
1079+ return tf_dataset
10541080
10551081
10561082def yggdrasil_model_to_keras_model (src_path : str , dst_path : str ):
0 commit comments