Fixed links to source files

David Cavazos · davidcavazos · commit 47f959868b29 · 2018-09-13T14:49:41.000-07:00
diff --git a/molecules/README.md b/molecules/README.md
@@ -148,7 +148,7 @@ For small datasets it will be faster to run locally.
 python trainer/task.py
 
 # To get the path of the trained model
-EXPORT_DIR=/tmp/cloudml-samples/molecules/model/export
+EXPORT_DIR=/tmp/cloudml-samples/molecules/model/export/final
 MODEL_DIR=$(ls -d -1 $EXPORT_DIR/* | sort -r | head -n 1)
 ```
 
@@ -168,11 +168,19 @@ gcloud ml-engine jobs submit training $JOB \
   --work-dir $WORK_DIR
 
 # To get the path of the trained model
-EXPORT_DIR=$WORK_DIR/model/export
+EXPORT_DIR=$WORK_DIR/model/export/final
 MODEL_DIR=$(gsutil ls -d $EXPORT_DIR/* | sort -r | head -n 1)
 ```
 
-## Batch Predictions
+To visualize the training job, we can use [TensorBoard](https://www.tensorflow.org/guide/summaries_and_tensorboard).
+```bash
+tensorboard --logdir $WORK_DIR/model
+```
+
+You can access the results at `localhost:6006`.
+
+## Predictions
+### Option 1: Batch Predictions
 Source code: [`predict.py`](predict.py)
 
 Batch predictions are optimized for throughput rather than latency. These work best if there's a large number of predictions to make and you can wait for all of them to finish before having the results.
@@ -204,7 +212,7 @@ python predict.py \
   --outputs-dir $WORK_DIR/predictions
 ```
 
-## Streaming Predictions
+### Option 2: Streaming Predictions
 Source code: [`predict.py`](predict.py)
 
 Streaming predictions are optimized for latency rather than throughput. These work best if you are sending sporadic predictions, but want to get the results as soon as possible.
@@ -252,7 +260,7 @@ Now that we have the prediction service running, we want to run a publisher to s
 For convenience, we provided a sample [`publisher.py`](publisher.py) and [`subscriber.py`](subscriber.py) to show how to implement one.
 
 These will have to be run as different processes concurrently, so you'll need to have a different terminal running each command.
-> NOTE: remember to activate the virtualenv on each terminal.
+> NOTE: remember to activate the `virtualenv` on each terminal.
 
 We'll first run the subscriber, which will listen for prediction results and log them.
 ```bash
@@ -272,3 +280,77 @@ python publisher.py \
 ```
 
 Once the publisher starts parsing and publishing molecules, we'll start seeing predictions from the subscriber.
+
+### Option 3: Cloud ML Engine Predictions
+If you have a different way to extract the features (in this case the atom counts) that is not through our existing preprocessing pipeline for SDF files, it might be easier to build a JSON file with one request per line and make the predictions on Cloud ML Engine.
+
+We've included the [`sample-requests.json`](sample-requests.json) file with an example of what these requests look like. Here are the contents of the file:
+```json
+{"TotalC": 9, "TotalH": 17, "TotalO": 4, "TotalN": 1}
+{"TotalC": 9, "TotalH": 18, "TotalO": 4, "TotalN": 1}
+{"TotalC": 7, "TotalH": 8, "TotalO": 4, "TotalN": 0}
+{"TotalC": 3, "TotalH": 9, "TotalO": 1, "TotalN": 1}
+```
+
+Before creating the model in Cloud ML Engine, it is a good idea to test our model's predictions locally:
+```bash
+# First we have to get the exported model's directory
+EXPORT_DIR=$WORK_DIR/model/export/final
+if [[ $EXPORT_DIR == gs://* ]]; then
+  # If it's a GCS path, use gsutil
+  MODEL_DIR=$(gsutil ls -d $EXPORT_DIR/* | sort -r | head -n 1)
+else
+  # If it's a local path, use ls
+  MODEL_DIR=$(ls -d -1 $EXPORT_DIR/* | sort -r | head -n 1)
+fi
+
+# To do the local predictions
+gcloud ml-engine local predict \
+  --model-dir $MODEL_DIR \
+  --json-instances sample-requests.json
+```
+
+For reference, these are the *real* energy values for the `sample-requests.json` file:
+```
+PREDICTIONS
+[37.801]
+[44.1107]
+[19.4085]
+[-0.1086]
+```
+
+Once we are happy with our results, we can now upload our model into Cloud ML Engine for online predictions.
+```bash
+# We want the model to reside on GCS and get its path
+EXPORT_DIR=$WORK_DIR/model/export/final
+if [[ $EXPORT_DIR == gs://* ]]; then
+  # If it's a GCS path, use gsutil
+  MODEL_DIR=$(gsutil ls -d $EXPORT_DIR/* | sort -r | head -n 1)
+else
+  # If it's a local path, first upload it to GCS
+  LOCAL_MODEL_DIR=$(ls -d -1 $EXPORT_DIR/* | sort -r | head -n 1)
+  MODEL_DIR=$BUCKET/cloudml-samples/molecules/model
+  gsutil -m cp -r $LOCAL_MODEL_DIR $MODEL_DIR
+fi
+
+# Now create the model and a version in Cloud ML Engine and set it as default
+MODEL=molecules
+REGION=$(gcloud config get-value compute/region)
+gcloud ml-engine models create $MODEL \
+  --regions $REGION
+
+VERSION="${MODEL}_$(date +%Y%m%d_%H%M%S)"
+gcloud ml-engine versions create $VERSION \
+  --model $MODEL \
+  --origin $MODEL_DIR \
+  --runtime-version 1.8
+
+gcloud ml-engine versions set-default $VERSION \
+  --model $MODEL
+
+# Finally, we can request predictions via gcloud ml-engine
+gcloud ml-engine predict \
+  --model $MODEL \
+  --version $VERSION \
+  --json-instances sample-requests.json
+```
diff --git a/molecules/data-extractor.py b/molecules/data-extractor.py
@@ -9,6 +9,8 @@
 # License for the specific language governing permissions and limitations under
 # the License.
 
+# This tool downloads SDF files from an FTP source.
+
 import StringIO
 import argparse
 import ftplib
diff --git a/molecules/predict.py b/molecules/predict.py
@@ -11,6 +11,8 @@
 # License for the specific language governing permissions and limitations under
 # the License.
 
+# This tool does either batch or streaming predictions on a trained model.
+
 from __future__ import print_function
 
 import argparse
@@ -93,7 +95,7 @@ def run(model_dir, feature_extraction, sink, beam_options=None):
         | 'Feature extraction' >> feature_extraction
         | 'Predict' >> beam.ParDo(Predict(model_dir, 'ID'))
         | 'Format as JSON' >> beam.Map(lambda result: json.dumps(result))
-        | 'Write to sink' >> sink)
+        | 'Write predictions' >> sink)
 
 
 if __name__ == '__main__':
diff --git a/molecules/preprocess.py b/molecules/preprocess.py
@@ -11,6 +11,8 @@
 # License for the specific language governing permissions and limitations under
 # the License.
 
+# This tool preprocesses and extracts features from SDF files.
+
 import argparse
 import dill as pickle
 import os
diff --git a/molecules/pubchem/pipeline.py b/molecules/pubchem/pipeline.py
@@ -157,7 +157,7 @@ def expand(self, p):
     # Return the preprocessing pipeline. In this case we're reading the PubChem
     # files, but the source could be any Apache Beam source.
     return (p
-        | 'Read from source' >> self.source
+        | 'Read raw molecules' >> self.source
         | 'Format molecule' >> beam.ParDo(FormatMolecule())
         | 'Count atoms' >> beam.ParDo(CountAtoms())
     )
diff --git a/molecules/publisher.py b/molecules/publisher.py
@@ -11,6 +11,8 @@
 # License for the specific language governing permissions and limitations under
 # the License.
 
+# This is a sample publisher for the streaming predictions service.
+
 import argparse
 import os
 import sys
diff --git a/molecules/run-cloud b/molecules/run-cloud
@@ -83,31 +83,11 @@ run gcloud ml-engine jobs submit training $JOB \
 echo ''
 
 # Get the model path
-EXPORT_DIR=$WORK_DIR/model/export
+EXPORT_DIR=$WORK_DIR/model/export/final
 MODEL_DIR=$(gsutil ls -d $EXPORT_DIR/* | sort -r | head -n 1)
 echo "Model: $MODEL_DIR"
 echo ''
 
-# Create a model in Google Cloud ML Engine if it doesn't exist
-MODEL=molecules
-if [[ -z $(gcloud ml-engine models list | awk '{print $1}' | grep "^$MODEL$") ]]; then
-  echo '>> Creating model'
-  run gcloud ml-engine models create $MODEL
-  echo ''
-fi
-
-# Create a model version
-VERSION=$JOB
-echo '>> Creating version'
-run gcloud ml-engine versions create $VERSION \
-  --model $MODEL \
-  --origin $MODEL_DIR \
-  --runtime-version $RUNTIME
-
-run gcloud ml-engine versions set-default $VERSION \
-  --model $MODEL
-echo ''
-
 # Make batch predictions on SDF files
 echo '>> Batch prediction'
 run python predict.py \
diff --git a/molecules/run-local b/molecules/run-local
@@ -49,7 +49,7 @@ run python trainer/task.py \
 echo ''
 
 # Get the model path
-EXPORT_DIR=$WORK_DIR/model/export
+EXPORT_DIR=$WORK_DIR/model/export/final
 if [[ $EXPORT_DIR == gs://* ]]; then
   MODEL_DIR=$(gsutil ls -d $EXPORT_DIR/* | sort -r | head -n 1)
 else
diff --git a/molecules/sample-requests.json b/molecules/sample-requests.json
@@ -0,0 +1,4 @@
+{"TotalC": 9, "TotalH": 17, "TotalO": 4, "TotalN": 1}
+{"TotalC": 9, "TotalH": 18, "TotalO": 4, "TotalN": 1}
+{"TotalC": 7, "TotalH": 8, "TotalO": 4, "TotalN": 0}
+{"TotalC": 3, "TotalH": 9, "TotalO": 1, "TotalN": 1}
diff --git a/molecules/subscriber.py b/molecules/subscriber.py
@@ -11,6 +11,8 @@
 # License for the specific language governing permissions and limitations under
 # the License.
 
+# This is a sample subscriber for the streaming predictions service.
+
 import argparse
 import json
 import logging
diff --git a/molecules/trainer/task.py b/molecules/trainer/task.py
@@ -11,6 +11,8 @@
 # License for the specific language governing permissions and limitations under
 # the License.
 
+# This tool trains an ML model on preprocessed data.
+
 import argparse
 import dill as pickle
 import multiprocessing as mp
@@ -42,7 +44,7 @@ def decode(elem):
     if mode == tf.estimator.ModeKeys.TRAIN:
       if shuffle:
         dataset = dataset.apply(tf.contrib.data.shuffle_and_repeat(
-          batch_size * 8))
+            batch_size * 8))
       else:
         dataset = dataset.cache()
         dataset = dataset.repeat()
@@ -152,21 +154,22 @@ def train_and_evaluate(
   tft_output = tft.TFTransformOutput(work_dir)
   feature_spec = tft_output.transformed_feature_spec()
 
-  # Train the model
-  train_input_fn = make_train_input_fn(
-      feature_spec, labels, train_files_pattern, batch_size)
-  estimator.train(input_fn=train_input_fn, max_steps=train_max_steps)
-
-  # Evaluate the model
-  eval_input_fn = make_eval_input_fn(
-      feature_spec, labels, eval_files_pattern, batch_size)
-  estimator.evaluate(input_fn=eval_input_fn, steps=None)
-
-  # Export the model
-  export_dir = os.path.join(model_dir, 'export')
-  serving_input_fn = make_serving_input_fn(
-      tft_output, input_feature_spec, labels)
-  estimator.export_savedmodel(export_dir, serving_input_fn)
+  # Create the training and evaluation specifications
+  train_spec = tf.estimator.TrainSpec(
+      input_fn=make_train_input_fn(
+          feature_spec, labels, train_files_pattern, batch_size),
+      max_steps=train_max_steps)
+
+  exporter = tf.estimator.FinalExporter(
+      'final', make_serving_input_fn(tft_output, input_feature_spec, labels))
+
+  eval_spec = tf.estimator.EvalSpec(
+      input_fn=make_eval_input_fn(
+          feature_spec, labels, eval_files_pattern, batch_size),
+      exporters=[exporter])
+
+  # Train and evaluate the model
+  tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
 
 
 if __name__ == '__main__':

Original file line number	Diff line number	Diff line change
`@@ -157,7 +157,7 @@ def expand(self, p):`
`157`	`157`	`# Return the preprocessing pipeline. In this case we're reading the PubChem`
`158`	`158`	`# files, but the source could be any Apache Beam source.`
`159`	`159`	`return (p`
`160`		`- \| 'Read from source' >> self.source`
	`160`	`+ \| 'Read raw molecules' >> self.source`
`161`	`161`	`\| 'Format molecule' >> beam.ParDo(FormatMolecule())`
`162`	`162`	`\| 'Count atoms' >> beam.ParDo(CountAtoms())`
`163`	`163`	`)`