Skip to content

Commit

Permalink
Internal change
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 375931435
  • Loading branch information
achoum authored and copybara-github committed May 26, 2021
1 parent 48faf05 commit 27c3e8e
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 2 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@

## 0.1.5 - ????

### Features

```
- Raise an error if the number of classes is greater than 100 (can be disabled).
- Raise an error if the model's task does not match the `pd_dataframe_to_tf_dataset`'s task.
```

### Bug fix

- Fix failure when input feature contains commas.
Expand Down
30 changes: 28 additions & 2 deletions tensorflow_decision_forests/keras/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -739,6 +739,16 @@ def fit(self,
All other fields are filled as usual for `Keras.Model.fit()`.
"""

# If the dataset was created with "pd_dataframe_to_tf_dataset", ensure that
# the task is correctly set.
if hasattr(x, "_tfdf_task"):
dataset_task = getattr(x, "_tfdf_task")
if dataset_task != self._task:
raise ValueError(
f"The model's `task` attribute ({Task.Name(self._task)}) does "
"not match the `task` attribute passed to "
f"`pd_dataframe_to_tf_dataset` ({Task.Name(dataset_task)}).")

# Call "compile" if the user forgot to do so.
if not self._is_compiled:
self.compile()
Expand Down Expand Up @@ -1005,7 +1015,8 @@ def _batch_size(inputs: Union[tf.Tensor, Dict[str, tf.Tensor]]) -> tf.Tensor:
def pd_dataframe_to_tf_dataset(
dataframe,
label: Optional[str] = None,
task: Optional[TaskType] = Task.CLASSIFICATION) -> tf.data.Dataset:
task: Optional[TaskType] = Task.CLASSIFICATION,
max_num_classes: Optional[int] = 100) -> tf.data.Dataset:
"""Converts a Pandas Dataframe into a TF Dataset.
Details:
Expand All @@ -1025,6 +1036,10 @@ def pd_dataframe_to_tf_dataset(
dataframe: Pandas dataframe containing a training or evaluation dataset.
label: Name of the label column.
task: Target task of the dataset.
max_num_classes: Maximum number of classes for a classification task. A high
number of unique values / classes might indicate that the problem is a
regression or a ranking instead of a classification. Set to None to
disable checking the number of classes.
Returns:
A TensorFlow Dataset.
Expand All @@ -1035,6 +1050,14 @@ def pd_dataframe_to_tf_dataset(
if task == Task.CLASSIFICATION and label is not None:
classification_classes = dataframe[label].unique().tolist()
classification_classes.sort()
if len(classification_classes) > max_num_classes:
raise ValueError(
f"The number of unique classes ({len(classification_classes)}) "
f"exceeds max_num_classes ({max_num_classes}). A high number of "
"unique value / classes might indicate that the problem is a "
"regression or a ranking instead of a classification. If this "
"problem is effectively a classification problem, increase "
"`max_num_classes`.")
dataframe[label] = dataframe[label].map(classification_classes.index)

# Make sure that missing values for string columns are not represented as
Expand All @@ -1050,7 +1073,10 @@ def pd_dataframe_to_tf_dataset(
tf_dataset = tf.data.Dataset.from_tensor_slices(dict(dataframe))

# The batch size does not impact the training of TF-DF.
return tf_dataset.batch(64)
tf_dataset = tf_dataset.batch(64)

setattr(tf_dataset, "_tfdf_task", task)
return tf_dataset


def yggdrasil_model_to_keras_model(src_path: str, dst_path: str):
Expand Down
14 changes: 14 additions & 0 deletions tensorflow_decision_forests/keras/keras_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1198,6 +1198,20 @@ def test_feature_with_comma(self):
dataset = pd.DataFrame({"a,b": [0, 1, 2], "label": [0, 1, 2]})
model.fit(keras.pd_dataframe_to_tf_dataset(dataset, label="label"))

def test_error_too_much_classes(self):
  """Checks that exceeding `max_num_classes` raises a `ValueError`."""
  # The label column holds ten distinct values, while only five classes are
  # allowed by the `max_num_classes` argument below.
  columns = {"x": list(range(10)), "label": list(range(10))}
  dataframe = pd.DataFrame(columns)
  self.assertRaises(
      ValueError,
      keras.pd_dataframe_to_tf_dataset,
      dataframe,
      label="label",
      max_num_classes=5)

def test_error_non_matching_task(self):
  """Checks that `fit` rejects a dataset built for a different task."""
  columns = {"x": list(range(10)), "label": list(range(10))}
  dataframe = pd.DataFrame(columns)
  # The dataset is created for classification...
  classification_dataset = keras.pd_dataframe_to_tf_dataset(
      dataframe, label="label", task=keras.Task.CLASSIFICATION)
  # ...while the model is configured for regression, so fitting must fail.
  regression_model = keras.GradientBoostedTreesModel(
      task=keras.Task.REGRESSION)
  self.assertRaises(ValueError, regression_model.fit, classification_dataset)


# Script entry point: hand control to the TensorFlow test runner when this
# module is executed directly rather than imported.
if __name__ == "__main__":
tf.test.main()

0 comments on commit 27c3e8e

Please sign in to comment.