Commit

Internal change
PiperOrigin-RevId: 374448559
achoum committed May 18, 2021
1 parent 6094ec0 commit 3681f87
Showing 10 changed files with 300 additions and 44 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Changelog

## 0.1.2 - 2021-05-18

### Features

- Inference engines: QuickScorer Extended and Pred

## 0.1.1 - 2021-05-17

### Features
2 changes: 1 addition & 1 deletion configure/setup.py
@@ -20,7 +20,7 @@
from setuptools.command.install import install
from setuptools.dist import Distribution

_VERSION = "0.1.2"

with open("README.md", "r", encoding="utf-8") as fh:
long_description = fh.read()
52 changes: 51 additions & 1 deletion documentation/known_issues.md
@@ -6,7 +6,9 @@ TensorFlow and Keras is new, and some issues are expected -- we are trying to
fix them as quickly as possible.

See also the
[known issues of Yggdrasil Decision Forests](https://github.com/google/yggdrasil-decision-forests/documentation/known_issues.md)
and the [migration guide](migration.md) for behavior that is different from
other algorithms.

## Windows Pip package is not available

@@ -33,3 +35,51 @@ an error complaining about tensor shape.

- *Solution #2:* Wrap your preprocessing function in another function that
    [squeezes](https://www.tensorflow.org/api_docs/python/tf/squeeze) its inputs.
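
A minimal sketch of Solution #2, using NumPy in place of TensorFlow so it is self-contained (`preprocessing` is a hypothetical user function; in real TF-DF code the wrapper would call `tf.squeeze` on TensorFlow tensors instead of `np.squeeze`):

```python
import numpy as np

def preprocessing(x):
    # Hypothetical user preprocessing: doubles every value.
    return x * 2.0

def squeezed(fn):
    # Wrap a preprocessing function so its input is squeezed first,
    # dropping the trailing size-1 dimension that trips up the model.
    def wrapper(x):
        return fn(np.squeeze(np.asarray(x), axis=-1))
    return wrapper

batch = np.ones((8, 1))   # a batch with an extra trailing dimension
out = squeezed(preprocessing)(batch)
print(out.shape)          # (8,)
```

The same wrapper pattern applies unchanged when the function operates on `tf.Tensor` inputs.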

## No support for TF distribution strategies.

TF-DF does not yet support distribution strategies or datasets that do not fit
in memory, because the classical decision forest training algorithms it
implements require the entire dataset to be available in memory.

**Workaround**

*   Downsample your dataset. A rule of thumb is that TF-DF training uses 4
    bytes per input dimension, so a dataset with 100 million examples and 10
    numerical/categorical features would take about 4 GB in memory.

*   Train a manual ensemble on slices of the dataset, i.e. train N models on
    N slices of the data and average their predictions.
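
The manual-ensemble workaround can be sketched as follows. This uses a toy NumPy stand-in for the per-slice models so the example runs anywhere; in practice each `train_on_slice` call would fit a `tfdf.keras.RandomForestModel` on its slice:

```python
import numpy as np

# Rule of thumb from above: 100e6 examples * 10 features * 4 bytes ~= 4 GB.

rng = np.random.default_rng(0)
features = rng.normal(size=(1000, 10))
labels = (features[:, 0] > 0).astype(float)

def train_on_slice(x, y):
    # Toy "model": predicts its slice's positive-class rate for every example.
    # A real version would fit a tfdf.keras.RandomForestModel on (x, y).
    rate = y.mean()
    return lambda inputs: np.full(len(inputs), rate)

# Train one model per slice of the dataset.
n_slices = 4
models = [train_on_slice(x, y)
          for x, y in zip(np.array_split(features, n_slices),
                          np.array_split(labels, n_slices))]

# Average the N models' predictions to form the ensemble prediction.
ensemble_pred = np.mean([m(features) for m in models], axis=0)
print(ensemble_pred.shape)  # (1000,)
```

Each model only ever sees its own slice, so no single training run needs the full dataset in memory.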

## No support for training callbacks.

Training callbacks will not receive the expected metrics in `on_epoch_end`,
since TF-DF algorithms train for only one epoch and the validation data is
evaluated before the model is trained. Evaluation callbacks are supported.

**Workaround**

By design, TF-DF algorithms train for only one epoch, so callbacks that
override `on_epoch_end` can be instantiated and called manually with the
metrics from `model.evaluate()`. Specifically:

```diff {.bad}
- cb = tf.keras.callbacks.Callback()
- model.fit(train_ds, validation_data=val_ds, callbacks=[cb])
```

```diff {.good}
+ model.fit(train_ds)
+ cb.on_epoch_end(epoch=1, logs=model.evaluate(val_ds, ...))
```
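
The workaround can be sketched end to end with a plain-Python stand-in for the callback, so it runs without TensorFlow (in real code `LoggingCallback` would subclass `tf.keras.callbacks.Callback`, and `eval_logs` would come from `model.evaluate(val_ds, return_dict=True)`):

```python
class LoggingCallback:
    """Plain-Python stand-in for a tf.keras.callbacks.Callback subclass."""

    def __init__(self):
        self.history = []

    def on_epoch_end(self, epoch, logs=None):
        # Record whatever metrics the caller passes in.
        self.history.append((epoch, logs or {}))

# Hypothetical metrics, standing in for the result of model.evaluate(val_ds).
eval_logs = {"loss": 0.35, "accuracy": 0.91}

cb = LoggingCallback()
# model.fit(train_ds)  # TF-DF trains in a single pass; no per-epoch metrics.
cb.on_epoch_end(epoch=1, logs=eval_logs)
print(cb.history)  # [(1, {'loss': 0.35, 'accuracy': 0.91})]
```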

## No support for GPU / TPU.

TF-DF does not support GPU or TPU training. Compiling with AVX instructions,
however, may speed up serving.

## No support for [model_to_estimator](https://www.tensorflow.org/api_docs/python/tf/keras/estimator/model_to_estimator)

TF-DF does not implement the APIs required to convert a trained/untrained model
to the estimator format.
73 changes: 72 additions & 1 deletion documentation/tutorials/advanced_colab.ipynb
@@ -116,7 +116,31 @@
"try:\n",
" from wurlitzer import sys_pipes\n",
"except:\n",
" from colabtools.googlelog import CaptureLog as sys_pipes\n",
"\n",
"from IPython.core.magic import register_line_magic\n",
"from IPython.display import Javascript"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "XAWSjWrQmVE0"
},
"outputs": [],
"source": [
"#@title View results with a max cell height.\n",
"\n",
"\n",
"# Some of the model training logs can cover the full\n",
"# screen if not compressed to a smaller viewport.\n",
"# This magic allows setting a max height for a cell.\n",
"@register_line_magic\n",
"def set_cell_height(size):\n",
" display(\n",
" Javascript(\"google.colab.output.setIframeHeight(0, true, {maxHeight: \" +\n",
" str(size) + \"})\"))"
]
},
{
@@ -179,6 +203,8 @@
},
"outputs": [],
"source": [
"%set_cell_height 300\n",
"\n",
"model.summary()"
]
},
@@ -191,6 +217,30 @@
"Note the multiple variable importances named `MEAN_DECREASE_IN_*`."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "xTwmx8A0c4TU"
},
"source": [
"## Plotting the model\n",
"\n",
"Next, we plot our model.\n",
"\n",
"A Random Forest is a large model (this model has 300 trees and ~5k nodes; see the summary above). Therefore, we will only plot the first tree and limit it to a maximum depth of 3."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ZRTrXDz_dIAQ"
},
"outputs": [],
"source": [
"tfdf.model_plotter.plot_model_in_colab(model, tree_idx=0, max_depth=3)"
]
},
{
"cell_type": "markdown",
"metadata": {
@@ -590,12 +640,33 @@
"inspector = tfdf.inspector.make_inspector(yggdrasil_model_path)\n",
"print(\"Input features:\", inspector.features())"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "muW1hgmotx8J"
},
"source": [
"And of course, you can plot the model :)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "bqahDVg3t1xM"
},
"outputs": [],
"source": [
"tfdf.model_plotter.plot_model_in_colab(manual_model)"
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "advanced_colab.ipynb",
"provenance": [],
"toc_visible": true
},
"kernelspec": {
100 changes: 94 additions & 6 deletions documentation/tutorials/beginner_colab.ipynb
@@ -161,7 +161,31 @@
"try:\n",
" from wurlitzer import sys_pipes\n",
"except:\n",
" from colabtools.googlelog import CaptureLog as sys_pipes\n",
"\n",
"from IPython.core.magic import register_line_magic\n",
"from IPython.display import Javascript"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2AhqJz3VmQM-"
},
"outputs": [],
"source": [
"#@title View results with a max cell height.\n",
"\n",
"\n",
"# Some of the model training logs can cover the full\n",
"# screen if not compressed to a smaller viewport.\n",
"# This magic allows setting a max height for a cell.\n",
"@register_line_magic\n",
"def set_cell_height(size):\n",
" display(\n",
" Javascript(\"google.colab.output.setIframeHeight(0, true, {maxHeight: \" +\n",
" str(size) + \"})\"))"
]
},
{
@@ -350,6 +374,8 @@
},
"outputs": [],
"source": [
"%set_cell_height 300\n",
"\n",
"# Specify the model.\n",
"model_1 = tfdf.keras.RandomForestModel()\n",
"\n",
@@ -463,6 +489,47 @@
"model_1.save(\"/tmp/my_saved_model\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "6-8R02_SXpbq"
},
"source": [
"## Plot the model\n",
"\n",
"Plotting a decision tree and following the first branches helps in learning about decision forests. In some cases, plotting a model can even be used for debugging.\n",
"\n",
"Because of the difference in the way they are trained, some models are more interesting to plot than others. Because of the noise injected during training and the depth of the trees, plotting a Random Forest is less informative than plotting a CART or the first tree of a Gradient Boosted Tree.\n",
"\n",
"Nevertheless, let's plot the first tree of our Random Forest model:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "KUIxf8N6Yjl0"
},
"outputs": [],
"source": [
"tfdf.model_plotter.plot_model_in_colab(model_1, tree_idx=0, max_depth=3)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "cPcL_hDnY7Zy"
},
"source": [
"The root node on the left contains the first condition (`bill_depth_mm \u003e= 16.55`), the number of examples (240), and the label distribution (the red-blue-green bar).\n",
"\n",
"Examples for which `bill_depth_mm \u003e= 16.55` is true are branched to the green path. The others are branched to the red path.\n",
"\n",
"The deeper the node, the purer it becomes, i.e. its label distribution is biased toward a subset of classes.\n",
"\n",
"**Note:** Hover the mouse over the plot for details."
]
},
{
"cell_type": "markdown",
"metadata": {
@@ -498,6 +565,7 @@
},
"outputs": [],
"source": [
"%set_cell_height 300\n",
"model_1.summary()"
]
},
@@ -597,6 +665,7 @@
},
"outputs": [],
"source": [
"%set_cell_height 150\n",
"model_1.make_inspector().training_logs()"
]
},
@@ -716,15 +785,22 @@
"id": "xmzvuI78voD4"
},
"source": [
"The description of the learning algorithms and their hyper-parameters are also available in the [API reference](https://www.tensorflow.org/decision_forests/api_docs/python/tfdf) and builtin help:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2hONToBav4DE"
},
"outputs": [],
"source": [
"# help works anywhere.\n",
"help(tfdf.keras.RandomForestModel)\n",
"\n",
"# ? only works in ipython or notebooks, it usually opens on a separate panel.\n",
"tfdf.keras.RandomForestModel?"
]
},
{
@@ -814,6 +890,8 @@
},
"outputs": [],
"source": [
"%set_cell_height 300\n",
"\n",
"feature_1 = tfdf.keras.FeatureUsage(name=\"year\", semantic=tfdf.keras.FeatureSemantic.CATEGORICAL)\n",
"feature_2 = tfdf.keras.FeatureUsage(name=\"bill_length_mm\")\n",
"feature_3 = tfdf.keras.FeatureUsage(name=\"sex\")\n",
@@ -971,6 +1049,8 @@
},
"outputs": [],
"source": [
"%set_cell_height 300\n",
"\n",
"body_mass_g = tf.keras.layers.Input(shape=(1,), name=\"body_mass_g\")\n",
"body_mass_kg = body_mass_g / 1000.0\n",
"\n",
@@ -1086,6 +1166,8 @@
},
"outputs": [],
"source": [
"%set_cell_height 300\n",
"\n",
"# Configure the model.\n",
"model_7 = tfdf.keras.RandomForestModel(task = tfdf.keras.Task.REGRESSION)\n",
"\n",
@@ -1161,6 +1243,8 @@
},
"outputs": [],
"source": [
"%set_cell_height 200\n",
"\n",
"archive_path = tf.keras.utils.get_file(\"letor.zip\",\n",
" \"https://download.microsoft.com/download/E/7/E/E7EABEF1-4C7B-4E31-ACE5-73927950ED5E/Letor.zip\",\n",
" extract=True)\n",
@@ -1266,6 +1350,8 @@
},
"outputs": [],
"source": [
"%set_cell_height 400\n",
"\n",
"model_8 = tfdf.keras.GradientBoostedTreesModel(\n",
" task=tfdf.keras.Task.RANKING,\n",
" ranking_group=\"group\",\n",
@@ -1299,6 +1385,8 @@
},
"outputs": [],
"source": [
"%set_cell_height 400\n",
"\n",
"model_8.summary()"
]
}
