Added xgboost_script_mode_local_training_and_serving sample

eitansela · eitansela · commit ceb62be4c872 · 2021-06-03T18:23:32.000+03:00
diff --git a/tensorflow_script_mode_california_housing_local_training_and_serving/tensorflow_script_mode_california_housing_local_training_and_serving.py b/tensorflow_script_mode_california_housing_local_training_and_serving/tensorflow_script_mode_california_housing_local_training_and_serving.py
@@ -1,4 +1,4 @@
-# This is a sample Python program that trains a simple TensorFlow CIFAR-10 model.
+# This is a sample Python program that trains a simple TensorFlow California Housing model.
 # This implementation will work on your *local computer* or in the *AWS Cloud*.
 # To run training and inference *locally* set: `config = get_config(LOCAL_MODE)`
 # To run training and inference on the *cloud* set: `config = get_config(CLOUD_MODE)` and set a valid IAM role value in get_config()
diff --git a/xgboost_script_mode_local_training_and_serving/code/abalone.py b/xgboost_script_mode_local_training_and_serving/code/abalone.py
@@ -0,0 +1,133 @@
+#  Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License").
+#  You may not use this file except in compliance with the License.
+#  A copy of the License is located at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  or in the "license" file accompanying this file. This file is distributed
+#  on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+#  express or implied. See the License for the specific language governing
+#  permissions and limitations under the License.
+from __future__ import print_function
+
+import argparse
+import json
+import logging
+import os
+import pickle as pkl
+
+import pandas as pd
+import xgboost as xgb
+from sagemaker_containers import entry_point
+from sagemaker_xgboost_container import distributed
+from sagemaker_xgboost_container.data_utils import get_dmatrix
+
+
+def _xgb_train(params, dtrain, evals, num_boost_round, model_dir, is_master):
+    """Train a booster via xgb.train() and, on the master host, pickle it into model_dir.
+
+    This is our rabit execution function; it is also called directly for single-node runs.
+
+    :param params, dtrain, evals, num_boost_round: forwarded unchanged to xgb.train().
+    :param model_dir: directory that receives the pickled "xgboost-model" file.
+    :param is_master: True if current node is master host in distributed training,
+                        or is running single node training job (rabit_run supplies it).
+    """
+    booster = xgb.train(params=params, dtrain=dtrain, evals=evals, num_boost_round=num_boost_round)
+    if is_master:
+        model_location = model_dir + "/xgboost-model"
+        with open(model_location, "wb") as model_file:  # close the handle deterministically
+            pkl.dump(booster, model_file)
+        logging.info("Stored trained model at {}".format(model_location))
+
+
+if __name__ == "__main__":  # training entry point: parse arguments, load data, train
+    parser = argparse.ArgumentParser()
+
+    # Hyperparameters are described here.
+    parser.add_argument(
+        "--max_depth",
+        type=int,
+    )
+    parser.add_argument("--eta", type=float)
+    parser.add_argument("--gamma", type=float)  # real-valued in XGBoost; int rejected e.g. 0.5
+    parser.add_argument("--min_child_weight", type=float)  # real-valued in XGBoost as well
+    parser.add_argument("--subsample", type=float)
+    parser.add_argument("--verbosity", type=int)
+    parser.add_argument("--objective", type=str)
+    parser.add_argument("--num_round", type=int)
+    parser.add_argument("--tree_method", type=str, default="auto")
+    parser.add_argument("--predictor", type=str, default="auto")
+
+    # Sagemaker specific arguments. Defaults are set in the environment variables.
+    parser.add_argument("--output_data_dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR"))
+    parser.add_argument("--model_dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
+    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
+    parser.add_argument("--validation", type=str, default=os.environ.get("SM_CHANNEL_VALIDATION"))
+    parser.add_argument("--sm_hosts", type=str, default=os.environ.get("SM_HOSTS"))
+    parser.add_argument("--sm_current_host", type=str, default=os.environ.get("SM_CURRENT_HOST"))
+
+    args, _ = parser.parse_known_args()  # unrecognised arguments are ignored, not rejected
+
+    # Get SageMaker host information from runtime environment variables
+    sm_hosts = json.loads(args.sm_hosts)  # SM_HOSTS must hold a JSON value; unset raises TypeError
+    sm_current_host = args.sm_current_host
+
+    dtrain = get_dmatrix(args.train, "libsvm")
+    dval = get_dmatrix(args.validation, "libsvm")
+    watchlist = (
+        [(dtrain, "train"), (dval, "validation")] if dval is not None else [(dtrain, "train")]
+    )
+
+    train_hp = {
+        "max_depth": args.max_depth,
+        "eta": args.eta,
+        "gamma": args.gamma,
+        "min_child_weight": args.min_child_weight,
+        "subsample": args.subsample,
+        "verbosity": args.verbosity,
+        "objective": args.objective,
+        "tree_method": args.tree_method,
+        "predictor": args.predictor,
+    }
+
+    xgb_train_args = dict(
+        params=train_hp,
+        dtrain=dtrain,
+        evals=watchlist,
+        num_boost_round=args.num_round,
+        model_dir=args.model_dir,
+    )
+
+    if len(sm_hosts) > 1:
+        # Wait until all hosts are able to find each other
+        entry_point._wait_hostname_resolution()
+
+        # Execute training function after initializing rabit.
+        distributed.rabit_run(
+            exec_fun=_xgb_train,
+            args=xgb_train_args,
+            include_in_training=(dtrain is not None),
+            hosts=sm_hosts,
+            current_host=sm_current_host,
+            update_rabit_args=True,
+        )
+    else:
+        # If single node training, call training method directly.
+        if dtrain is not None:  # same explicit None test as the distributed branch above
+            xgb_train_args["is_master"] = True
+            _xgb_train(**xgb_train_args)
+        else:
+            raise ValueError("Training channel must have data to train model.")
+
+
+def model_fn(model_dir):
+    """Load the pickled booster stored as "xgboost-model" under model_dir and return it.
+
+    The file name must match the one written by the _xgb_train method. Unpickling can run
+    arbitrary code, so only load model artifacts that you produced or otherwise trust.
+    """
+    with open(os.path.join(model_dir, "xgboost-model"), "rb") as model_file:
+        return pkl.load(model_file)
diff --git a/xgboost_script_mode_local_training_and_serving/code/inference.py b/xgboost_script_mode_local_training_and_serving/code/inference.py
@@ -0,0 +1,63 @@
+#  Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License").
+#  You may not use this file except in compliance with the License.
+#  A copy of the License is located at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  or in the "license" file accompanying this file. This file is distributed
+#  on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+#  express or implied. See the License for the specific language governing
+#  permissions and limitations under the License.
+import json
+import os
+import pickle as pkl
+
+import numpy as np
+import sagemaker_xgboost_container.encoder as xgb_encoders
+
+
+def model_fn(model_dir):
+    """
+    Load and return the pickled model stored as "xgboost-model" inside model_dir.
+    Only use with model artifacts you trust: unpickling can execute arbitrary code.
+    """
+    with open(os.path.join(model_dir, "xgboost-model"), "rb") as model_file:
+        return pkl.load(model_file)
+
+
+def input_fn(request_body, request_content_type):
+    """
+    Called by the SageMaker XGBoost model server with the request data body and its
+    content type.
+
+    Return a DMatrix (an object that can be passed to predict_fn); only "text/libsvm"
+    requests are accepted, anything else raises ValueError.
+    """
+    if request_content_type != "text/libsvm":
+        raise ValueError("Content type {} is not supported.".format(request_content_type))
+    return xgb_encoders.libsvm_to_dmatrix(request_body)
+
+
+def predict_fn(input_data, model):
+    """
+    Invoked by the SageMaker XGBoost model server on the value returned by `input_fn`.
+
+    Builds a two-dimensional NumPy array: the first column holds the predictions and the
+    remaining columns hold the feature contributions (SHAP values) for that prediction.
+    """
+    scores = model.predict(input_data)
+    contributions = model.predict(input_data, pred_contribs=True, validate_features=False)
+    score_column = scores[:, np.newaxis]
+    return np.hstack((score_column, contributions))
+
+
+def output_fn(predictions, content_type):
+    """
+    Invoked by the model server with the value produced by predict_fn.
+    Serializes the first row of `predictions` as one comma-separated "text/csv" line.
+    """
+    if content_type != "text/csv":
+        raise ValueError("Content type {} is not supported.".format(content_type))
+    return ",".join(map(str, predictions[0]))
diff --git a/xgboost_script_mode_local_training_and_serving/requirements.txt b/xgboost_script_mode_local_training_and_serving/requirements.txt
@@ -0,0 +1,4 @@
+numpy
+pandas
+sagemaker>=2.0.0,<3.0.0
+sagemaker[local]
diff --git a/xgboost_script_mode_local_training_and_serving/xgboost_script_mode_local_training_and_serving.py b/xgboost_script_mode_local_training_and_serving/xgboost_script_mode_local_training_and_serving.py
@@ -0,0 +1,84 @@
+# This is a sample Python program that trains a simple XGBoost model on Abalone dataset.
+# This implementation will work on your *local computer* or in the *AWS Cloud*.
+# To run training and inference *locally* keep the `instance_type='local'` / `instance_type="local"` settings used in main().
+# To run training and inference on the *cloud* set SageMaker ML instance types there instead and replace DUMMY_IAM_ROLE with a valid IAM role.
+#
+# Prerequisites:
+#   1. Install required Python packages:
+#      `pip install -r requirements.txt`
+#   2. Docker Desktop installed and running on your computer:
+#      `docker ps`
+#   3. You should have AWS credentials configured on your local machine
+#      in order to be able to pull the docker image from ECR.
+###############################################################################################
+
+from sagemaker import TrainingInput
+from sagemaker.xgboost import XGBoost, XGBoostModel
+
+DUMMY_IAM_ROLE = 'arn:aws:iam::111111111111:role/service-role/AmazonSageMaker-ExecutionRole-20200101T000001'
+
+
+def do_inference_on_local_endpoint(predictor, libsvm_str):
+    """Send one libsvm record to `predictor` with its label swapped for a dummy -99 and print the reply."""
+    _true_label, *feature_tokens = libsvm_str.split()  # the real label is discarded
+    print("Prediction: {}".format(predictor.predict(" ".join(["-99", *feature_tokens]))))
+
+
+def main():
+    """Train the Abalone XGBoost model in local mode, deploy it locally, run two predictions, clean up."""
+    print('Starting model training.')
+    print('Note: if launching for the first time in local mode, container image download might take a few minutes to complete.')
+
+    hyperparameters = {
+        "max_depth": "5",
+        "eta": "0.2",
+        "gamma": "4",
+        "min_child_weight": "6",
+        "subsample": "0.7",
+        "objective": "reg:squarederror",
+        "num_round": "50",
+        "verbosity": "2",
+    }
+
+    xgb_script_mode_estimator = XGBoost(
+        entry_point="./code/abalone.py",
+        hyperparameters=hyperparameters,
+        role=DUMMY_IAM_ROLE,
+        instance_count=1,
+        instance_type='local',
+        framework_version="1.2-1"
+    )
+
+    train_input = TrainingInput("s3://xgboost-script-mode-local-training-and-serving/train/abalone", content_type="text/libsvm")
+    # NOTE: the sample feeds the same input to both channels, so the validation metric is not held-out.
+    xgb_script_mode_estimator.fit({"train": train_input, "validation": train_input})
+    print('Completed model training')
+
+    model_data = xgb_script_mode_estimator.model_data
+    print(model_data)
+
+    xgb_inference_model = XGBoostModel(
+        model_data=model_data,
+        role=DUMMY_IAM_ROLE,
+        entry_point="./code/inference.py",
+        framework_version="1.2-1",
+    )
+
+    print('Deploying endpoint in local mode')
+    predictor = xgb_inference_model.deploy(
+        initial_instance_count=1,
+        instance_type="local",
+    )
+    try:  # always tear the endpoint down, even when a prediction fails
+        a_young_abalone = "6 1:3 2:0.37 3:0.29 4:0.095 5:0.249 6:0.1045 7:0.058 8:0.067"
+        do_inference_on_local_endpoint(predictor, a_young_abalone)
+
+        an_old_abalone = "15 1:1 2:0.655 3:0.53 4:0.175 5:1.2635 6:0.486 7:0.2635 8:0.415"
+        do_inference_on_local_endpoint(predictor, an_old_abalone)
+    finally:
+        print('About to delete the endpoint to stop paying (if in cloud mode).')
+        predictor.delete_endpoint()  # its only argument is the delete_endpoint_config flag, not a name
+
+
+if __name__ == "__main__":
+    main()

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-# This is a sample Python program that trains a simple TensorFlow CIFAR-10 model.`
	`1`	`+# This is a sample Python program that trains a simple TensorFlow California Housing model.`
`2`	`2`	`# This implementation will work on your local computer or in the AWS Cloud.`
`3`	`3`	# To run training and inference locally set: `config = get_config(LOCAL_MODE)`
`4`	`4`	# To run training and inference on the cloud set: `config = get_config(CLOUD_MODE)` and set a valid IAM role value in get_config()