
Commit 6dcd442

Authored by Malav Shastri (malav-shastri)
Upgrade XgBoost to 1.7.x (#362)
* Upgrade XgBoost version to 1.7.3
* Restructure checkpointing.py and fix unit tests
* Resolve Flake8 style errors
* Address comments and improvements
* Add comments and improvements
* Upgrade xgboost to 1.7.4
* Remove grow_local_histmaker and single precision histogram

Co-authored-by: Malav Shastri <[email protected]>
1 parent 3aa31e9 commit 6dcd442
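
The bulk of this diff is in checkpointing.py: the XGBoost 1.6/1.7 line removed the legacy env-based callback protocol and private helpers such as xgboost.callback._fmt_metric, so the container's checkpoint and logging callbacks are rewritten as subclasses of xgb.callback.TrainingCallback. A minimal sketch of that interface follows; the IterationLogger class and the toy data are illustrative, not part of this commit.

import numpy as np
import xgboost as xgb


class IterationLogger(xgb.callback.TrainingCallback):
    """Illustrative callback written against the TrainingCallback API (XGBoost >= 1.6)."""

    def after_iteration(self, model, epoch, evals_log):
        # evals_log maps data name -> metric name -> list of scores so far.
        for data, metrics in evals_log.items():
            for metric_name, history in metrics.items():
                print("[%d] %s-%s: %s" % (epoch, data, metric_name, history[-1]))
        # Returning False tells xgb.train() to keep training.
        return False


dtrain = xgb.DMatrix(np.random.rand(20, 3), label=np.random.randint(0, 2, 20))
xgb.train(
    {"objective": "binary:logistic"},
    dtrain,
    num_boost_round=5,
    evals=[(dtrain, "train")],
    callbacks=[IterationLogger()],
)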

File tree: 14 files changed (+167, -137 lines)


README.rst (+1, -1)

@@ -253,4 +253,4 @@ SageMaker XGboost Framework Container is licensed under the Apache 2.0 License.
 .com, Inc. or its affiliates. All Rights Reserved. The license is available at:
 http://aws.amazon.com/apache2.0/
 
-.. |XGBoostLatestVersion| replace:: 1.5-1
+.. |XGBoostLatestVersion| replace:: 1.7-1

docker/1.5-1/base/Dockerfile.cpu -> docker/1.7-1/base/Dockerfile.cpu (renamed; +1, -1)

@@ -11,7 +11,7 @@ ARG CONDA_PKG_VERSION=4.10.1
 ARG PYTHON_VERSION=3.8.13
 ARG PYARROW_VERSION=1.0
 ARG MLIO_VERSION=0.7.0
-ARG XGBOOST_VERSION=1.5.2
+ARG XGBOOST_VERSION=1.7.4
 
 ENV DEBIAN_FRONTEND=noninteractive
 ENV LANG=C.UTF-8
File renamed without changes.

docker/1.5-1/final/Dockerfile.cpu -> docker/1.7-1/final/Dockerfile.cpu (renamed; +1, -1)

@@ -1,4 +1,4 @@
-ARG SAGEMAKER_XGBOOST_VERSION=1.5-1
+ARG SAGEMAKER_XGBOOST_VERSION=1.7-1
 ARG PYTHON_VERSION=3.8
 
 FROM xgboost-container-base:${SAGEMAKER_XGBOOST_VERSION}-cpu-py3

requirements.txt (+1, -1)

@@ -3,7 +3,7 @@ PyYAML==5.4.1
 Pillow==9.1.1
 boto3==1.17.52
 botocore==1.20.52
-cryptography==35.0.0
+cryptography==39.0.1
 dask==2022.11.1
 dask-cuda==22.12.0
 gunicorn==19.10.0

src/sagemaker_xgboost_container/algorithm_mode/hyperparameter_validation.py (+2, -7)

@@ -29,7 +29,6 @@ def updater_validator(value, dependencies):
         "grow_colmaker",
         "distcol",
         "grow_histmaker",
-        "grow_local_histmaker",
         "grow_skmaker",
         "sync",
         "refresh",
@@ -40,7 +39,6 @@ def updater_validator(value, dependencies):
         "grow_colmaker",
         "distcol",
         "grow_histmaker",
-        "grow_local_histmaker",
         "grow_colmaker",
         "grow_quantile_histmaker",
     ]
@@ -62,7 +60,7 @@ def updater_validator(value, dependencies):
     if not all(x in valid_tree_plugins for x in value):
         raise exc.UserError(
             "Tree updater should be selected from these options: 'grow_colmaker', 'distcol', 'grow_histmaker', "
-            "'grow_local_histmaker', 'grow_skmaker', 'grow_quantile_histmaker', 'sync', 'refresh', 'prune', "
+            "'grow_skmaker', 'grow_quantile_histmaker', 'sync', 'refresh', 'prune', "
             "'shortgun', 'coord_descent'."
         )
     # validate only one tree updater is selected
@@ -74,7 +72,7 @@ def updater_validator(value, dependencies):
         raise exc.UserError(
             "Only one tree grow plugin can be selected. Choose one from the"
             "following: 'grow_colmaker', 'distcol', 'grow_histmaker', "
-            "'grow_local_histmaker', 'grow_skmaker'"
+            "'grow_skmaker'"
         )
 
 @hpv.range_validator(["auto", "cpu_predictor", "gpu_predictor"])
@@ -239,15 +237,13 @@ def interaction_constraints_validator(value, dependencies):
         "grow_colmaker",
         "distcol",
         "grow_histmaker",
-        "grow_local_histmaker",
         "grow_skmaker",
         "sync",
         "refresh",
         "prune",
         "grow_colmaker",
         "distcol",
         "grow_histmaker",
-        "grow_local_histmaker",
         "grow_colmaker",
         "shotgun",
         "coord_descent",
@@ -334,7 +330,6 @@ def interaction_constraints_validator(value, dependencies):
     hpv.ContinuousHyperparameter(
         name="aft_loss_distribution_scale", range=hpv.Interval(min_closed=0), required=False
    ),
-    hpv.CategoricalHyperparameter(name="single_precision_histogram", range=["true", "false"], required=False),
     hpv.CategoricalHyperparameter(name="deterministic_histogram", range=["true", "false"], required=False),
     hpv.CategoricalHyperparameter(name="sampling_method", range=["uniform", "gradient_based"], required=False),
     hpv.IntegerHyperparameter(name="prob_buffer_row", range=hpv.Interval(min_open=1.0), required=False),
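
For background, grow_local_histmaker was removed and single_precision_histogram deprecated upstream in the XGBoost 1.7 line, so the validator no longer advertises or accepts them. A rough sketch of a training call that stays within the still-supported updater values; the data and parameter choices are illustrative, not taken from this commit.

import numpy as np
import xgboost as xgb

# Illustrative only: "grow_local_histmaker" is gone in XGBoost 1.7.x, and the
# container no longer validates a "single_precision_histogram" hyperparameter.
# The remaining tree updaters listed by the validator still work, e.g.:
dtrain = xgb.DMatrix(np.random.rand(30, 4), label=np.random.rand(30))
params = {
    "objective": "reg:squarederror",
    "updater": "grow_colmaker,prune",  # exact greedy growth followed by pruning
}
booster = xgb.train(params, dtrain, num_boost_round=5)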

src/sagemaker_xgboost_container/checkpointing.py (+107, -71)

@@ -6,9 +6,10 @@
 import threading
 
 import xgboost as xgb
+from typing import Optional
 from xgboost import rabit
-from xgboost.callback import _fmt_metric as format_metric
-from xgboost.core import Booster, XGBoostError
+from xgboost.callback import EvaluationMonitor
+from xgboost.core import XGBoostError
 
 TEMP_FILE_SUFFIX = ".sagemaker-ignore"
 FILE_LOCK_SUFFIX = ".sagemaker-uploading"
@@ -42,29 +43,33 @@ def train(train_args, checkpoint_dir):
 
     xgb_model, start_iteration = load_checkpoint(checkpoint_dir)
 
+    # xgboost's default value for num_boost_round is 10.
+    # https://xgboost.readthedocs.io/en/stable/python/python_api.html#module-xgboost.training
+    # If num_boost_round <= 0, xgb.train() doesn't actually train and
+    # immediately returns a Booster object.
+    train_args["num_boost_round"] = train_args.get("num_boost_round", 10) - start_iteration
+
     if xgb_model is not None:
         logging.info("Checkpoint loaded from %s", xgb_model)
         logging.info("Resuming from iteration %s", start_iteration)
 
     callbacks = train_args.get("callbacks", [])
-    callbacks.append(print_checkpointed_evaluation(start_iteration=start_iteration))
-    callbacks.append(save_checkpoint(checkpoint_dir, start_iteration=start_iteration))
+    callbacks.append(print_checkpointed_evaluation(start_iteration=start_iteration,
+                                                   end_iteration=train_args["num_boost_round"]))
+    callbacks.append(save_checkpoint(checkpoint_dir, start_iteration=start_iteration, iteration=start_iteration,
+                                     end_iteration=train_args["num_boost_round"]))
 
     train_args["verbose_eval"] = False  # suppress xgboost's print_evaluation()
     train_args["xgb_model"] = xgb_model
     train_args["callbacks"] = callbacks
-    # xgboost's default value for num_boost_round is 10.
-    # If num_boost_round <= 0, xgb.train() doesn't actually train and
-    # immediately returns a Booster object.
-    train_args["num_boost_round"] = train_args.get("num_boost_round", 10) - start_iteration
 
     booster = xgb.train(**train_args)
 
     return booster
 
 
-def print_checkpointed_evaluation(period=1, show_stdv=True, start_iteration=0):
-    """Create a callback that print evaluation result.
+class PrintCheckpoint(xgb.callback.TrainingCallback):
+    """Create a callback that print evaluation result every period iteration.
 
     This function was modified from https://github.com/dmlc/xgboost/blob/master/python-package/xgboost/callback.py
     The only difference between the following function and the original function in xgboost.callback
@@ -73,41 +78,62 @@ def print_checkpointed_evaluation(period=1, show_stdv=True, start_iteration=0):
     We print the evaluation results every **period** iterations
     and on the first and the last iterations.
 
-    Parameters
+    Attributes
     ----------
     period : int
-        The period to log the evaluation results
-
+        The period to log the evaluation results
     show_stdv : bool, optional
-        Whether show stdv if provided
-
+        Whether show stdv if provided
     start_iteration: int, optioonal
-        Used for offsetting the iteratoin number that appears at the beginning of each evaluation result in the logs.
-
-    Returns
-    -------
-    callback : function
-        A callback that print evaluation every period iterations.
+        Used for offsetting the iteratoin number that appears at the beginning of each evaluation result in the logs.
     """
 
-    def callback(env):
-        """internal function"""
-        if env.rank != 0 or (not env.evaluation_result_list) or period is False or period == 0:
-            return
-        i = env.iteration
-        if i % period == 0 or i + 1 == env.begin_iteration or i + 1 == env.end_iteration:
-            msg = "\t".join([format_metric(x, show_stdv) for x in env.evaluation_result_list])
-            rabit.tracker_print("[%d]\t%s\n" % (i + start_iteration, msg))
+    def __init__(self, end_iteration, iteration=0, rank=0, period=1, show_stdv=True, start_iteration=0):
+        self.period = period
+        self.show_stdv = show_stdv
+        self.start_iteration = start_iteration
+        self.rank = rank
+        self.iteration = iteration
+        self.end_iteration = end_iteration
 
-    return callback
+    def __call__(self, model, epoch=0, evals_log=None):
+        return self.after_iteration(model, epoch, evals_log)
+
+    def after_iteration(self, model, epoch=0, evals_log=None):
+        if self.rank != 0 or (not evals_log) or self.period is False or self.period == 0:
+            return
+        i = self.iteration
+        if i % self.period == 0 or i + 1 == self.start_iteration or i + 1 == self.end_iteration:
+            evaluation_monitor = EvaluationMonitor(self.rank, self.period, self.show_stdv)
+            msg: str = ""
+            for data, metric in evals_log.items():
+                for metric_name, log in metric.items():
+                    stdv: Optional[float] = None
+                    if isinstance(log[-1], tuple):
+                        score = log[-1][0]
+                        stdv = log[-1][1]
+                    else:
+                        score = log[-1]
+                    msg += evaluation_monitor._fmt_metric(data, metric_name, score, stdv)
+                    msg += "\n"
+            rabit.tracker_print("[%d]\t%s\n" % (i + self.start_iteration, msg))
+
+
+def print_checkpointed_evaluation(end_iteration, iteration=0, rank=0, period=1, show_stdv=True, start_iteration=0):
+    """A callback function that print evaluation result every period iteration.
+
+    This is a wrapper function around PrintCheckpoint.
+    For details, see PrintCheckpoint.
+    """
+    return PrintCheckpoint(end_iteration, iteration, rank, period, show_stdv, start_iteration)
 
 
 def load_checkpoint(checkpoint_dir, max_try=5):
     """
     :param checkpoint_dir: e.g., /opt/ml/checkpoints
     :param max_try: number of times to try loading checkpoint before giving up.
     :return xgb_model: file path of stored xgb model. None if no checkpoint.
-    :return iteration: iterations completed before last checkpoiint.
+    :return iteration: iterations completed before last checkpoint.
     """
     if not checkpoint_dir or not os.path.exists(checkpoint_dir):
         return None, 0
@@ -124,9 +150,6 @@ def load_checkpoint(checkpoint_dir, max_try=5):
         try:
             latest_checkpoint = checkpoints.pop()
             xgb_model = os.path.join(checkpoint_dir, latest_checkpoint)
-            booster = Booster()
-            booster.load_model(xgb_model)
-
             filename, extension = latest_checkpoint.split(".")
             iteration = int(extension) + 1
             break
@@ -141,18 +164,20 @@ def _sort_checkpoints(checkpoint_files):
     return checkpoint_files
 
 
-def save_checkpoint(checkpoint_dir, start_iteration=0, max_to_keep=5, num_round=None):
+def save_checkpoint(checkpoint_dir, start_iteration=0, max_to_keep=5, num_round=None, rank=0, iteration=0,
+                    end_iteration=None):
     """A callback function that saves checkpoints to disk.
 
     This is a wrapper function around SaveCheckpoint.
     For details, see SaveCheckpoint.
     """
-    return SaveCheckpoint(
-        checkpoint_dir=checkpoint_dir, start_iteration=start_iteration, max_to_keep=max_to_keep, num_round=num_round
+    return SaveCheckpointCallBack(
+        checkpoint_dir=checkpoint_dir, start_iteration=start_iteration, max_to_keep=max_to_keep, num_round=num_round,
+        iteration=iteration, end_iteration=end_iteration
     )
 
 
-class SaveCheckpoint(object):
+class SaveCheckpointCallBack(xgb.callback.TrainingCallback):
    """Create a callback that saves checkpoints to disk.
 
     The main purpose of this class is to support checkpointing for managed spot
@@ -192,19 +217,23 @@ class SaveCheckpoint(object):
         after round 19, start_iteration will be 20).
         num_round: (optional) indicates the number of boosting rounds.
 
-    Example:
-        >>> save_checkpoint = SaveCheckpoint("/opt/ml/checkpoints")
-        >>> xgboost.train(prams, dtrain, callbacks=[save_checkpoint])
-    """
+    Example:
+        >>> save_checkpoint = SaveCheckpoint("/opt/ml/checkpoints")
+        >>> xgboost.train(prams, dtrain, callbacks=[save_checkpoint])
+    """
 
     SENTINEL = None
 
-    def __init__(self, checkpoint_dir, start_iteration=0, max_to_keep=5, num_round=None):
+    def __init__(self, checkpoint_dir, start_iteration=0, max_to_keep=5, num_round=None, rank=0, iteration=0,
+                 end_iteration=None):
         """Init SaveCheckpoint with checkpoint_dir"""
         self.checkpoint_dir = checkpoint_dir
         self.max_to_keep = max_to_keep
         self.start_iteration = start_iteration
         self.num_round = num_round
+        self.rank = rank
+        self.iteration = iteration
+        self.end_iteration = end_iteration
 
         if not os.path.exists(self.checkpoint_dir):
             os.makedirs(self.checkpoint_dir)
@@ -215,16 +244,46 @@ def __init__(self, checkpoint_dir, start_iteration=0, max_to_keep=5, num_round=N
 
         self.start()
 
-    def __call__(self, env):
+    def __call__(self, model, epoch=0, evals_log=None):
         """Make the class callable since it is meant be used as a callback"""
-        return self.callback(env)
+        return self.after_iteration(model, epoch, evals_log)
 
     def format_path(self, iteration):
         """Return a file path to checkpoint given a iteration number"""
         filename = "{}.{}".format(CHECKPOINT_FILENAME, iteration)
         checkpoint_path = os.path.join(self.checkpoint_dir, filename)
         return checkpoint_path
 
+    def after_iteration(self, model, epoch=0, evals_log=None) -> bool:
+        # rank: master node has rank 0.
+        # iteration: current boosting round
+        # end_iteration: round # when training will end. this is always num_round + 1.
+        # model: model object
+        if self.rank != 0:
+            logger.debug("Not master (rank = %d). Exiting checkpoint callback.", self.rank)
+            return
+
+        if len(os.listdir(self.checkpoint_dir)) != 0:
+            xgb_model, self.iteration = load_checkpoint(self.checkpoint_dir)
+            current_iteration = self.iteration
+        else:
+            current_iteration = self.start_iteration + self.iteration
+        self._save_checkpoint(model, current_iteration)
+
+        # For example, if we are at iteration 5 and max_to_keep is 5, we no
+        # longer need checkpoint from iteration 0 (i.e., xgboost-checkpoint.0),
+        # so we put iteration_to_delete = 0 on the queue.
+        iteration_to_delete = current_iteration - self.max_to_keep
+        self.delete_queue.put(iteration_to_delete)
+
+        offset_iteration = self.end_iteration if self.num_round is None else self.num_round
+
+        training_has_ended = current_iteration + 1 >= self.start_iteration + offset_iteration
+
+        if training_has_ended:
+            self.stop()
+        return False
+
     def start(self):
         """Start a background thread that deletes old checkpoints
 
@@ -236,7 +295,6 @@ def start(self):
         When training is complete, we put SENTINEL on the queue, and when we
         see the SENTINEL, we clean up and exit the thread.
         """
-
         def _is_uploading(path):
             uploading = os.path.isfile(path + FILE_LOCK_SUFFIX)
             uploaded = os.path.isfile(path + FILE_SAFE_SUFFIX)
@@ -286,7 +344,9 @@ def _delete_uploaded_files_and_cleanup():
             _delete_uploaded_files()
             _cleanup()
 
-        self.thread = threading.Thread(target=_delete_uploaded_files_and_cleanup, daemon=True)
+        self.thread = threading.Thread(
+            target=_delete_uploaded_files_and_cleanup,
+            daemon=True)
         self.thread.start()
 
     def stop(self):
@@ -304,30 +364,6 @@ def _save_checkpoint(self, model, iteration):
         save_file_path = self.format_path(iteration)
         os.rename(tf.name, save_file_path)
 
-    def callback(self, env):
-        # env.rank: rabit rank of the node/process. master node has rank 0.
-        # env.iteration: current boosting round
-        # env.begin_iteration: round # when training started. this is always 0.
-        # env.end_iteration: round # when training will end. this is always num_round + 1.
-        # env.model: model object
-        if env.rank != 0:
-            logger.debug("Not master (rank = %d). Exiting checkpoint callback.", env.rank)
-            return
-
-        current_iteration = self.start_iteration + env.iteration
-        self._save_checkpoint(env.model, current_iteration)
-
-        # For example, if we are at iteration 5 and max_to_keep is 5, we no
-        # longer need checkpoint from iteration 0 (i.e., xgboost-checkpoint.0),
-        # so we put iteration_to_delete = 0 on the queue.
-        iteration_to_delete = current_iteration - self.max_to_keep
-        self.delete_queue.put(iteration_to_delete)
-
-        offset_iteration = env.end_iteration if self.num_round is None else self.num_round
-        training_has_ended = current_iteration + 1 >= self.start_iteration + offset_iteration
-        if training_has_ended:
-            self.stop()
-
 
 def save_intermediate_model(intermediate_model_dir, model_name):
     """A callback function that saves intermediate models to disk.

test/resources/versions/train.py (+1, -1)

@@ -11,7 +11,7 @@
 boto3==1.17.52
 botocore==1.20.52
 conda==4.10.1
-cryptography==35.0.0
+cryptography==39.0.1
 gunicorn==19.10.0
 matplotlib==3.4.1
 multi-model-server==1.1.2
