Commit 1782329

fix: remove historical job_name caching which causes long job name (#5118)
1 parent 0a86e60 commit 1782329

3 files changed: +7 -50 lines changed


Diff for: src/sagemaker/workflow/steps.py

+1 -44

@@ -18,7 +18,6 @@
 
 from enum import Enum
 from typing import Dict, List, Set, Union, Optional, Any, TYPE_CHECKING
-from urllib.parse import urlparse
 
 import attr
 
@@ -465,6 +464,7 @@ def __init__(
         self.step_args = step_args
         self.estimator = estimator
         self.inputs = inputs
+        self.job_name = None
 
         self._properties = Properties(
             step_name=name, step=self, shape_name="DescribeTrainingJobResponse"
@@ -493,19 +493,6 @@ def __init__(
                 DeprecationWarning,
             )
 
-        self.job_name = None
-        if estimator and (estimator.source_dir or estimator.entry_point):
-            # By default, `Estimator` will upload the local code to an S3 path
-            # containing a timestamp. This causes cache misses whenever a
-            # pipeline is updated, even if the underlying script hasn't changed.
-            # To avoid this, hash the contents of the training script and include it
-            # in the `job_name` passed to the `Estimator`, which will be used
-            # instead of the timestamped path.
-            if not is_pipeline_variable(estimator.source_dir) and not is_pipeline_variable(
-                estimator.entry_point
-            ):
-                self.job_name = self._generate_code_upload_path()
-
     @property
     def arguments(self) -> RequestType:
         """The arguments dictionary that is used to call `create_training_job`.
@@ -554,26 +541,6 @@ def to_request(self) -> RequestType:
 
         return request_dict
 
-    def _generate_code_upload_path(self) -> str or None:
-        """Generate an upload path for local training scripts based on their content."""
-        from sagemaker.workflow.utilities import hash_files_or_dirs
-
-        if self.estimator.source_dir:
-            source_dir_url = urlparse(self.estimator.source_dir)
-            if source_dir_url.scheme == "" or source_dir_url.scheme == "file":
-                code_hash = hash_files_or_dirs(
-                    [self.estimator.source_dir] + self.estimator.dependencies
-                )
-                return f"{self.name}-{code_hash}"[:1024]
-        elif self.estimator.entry_point:
-            entry_point_url = urlparse(self.estimator.entry_point)
-            if entry_point_url.scheme == "" or entry_point_url.scheme == "file":
-                code_hash = hash_files_or_dirs(
-                    [self.estimator.entry_point] + self.estimator.dependencies
-                )
-                return f"{self.name}-{code_hash}"[:1024]
-        return None
-
 
 class CreateModelStep(ConfigurableRetryStep):
     """`CreateModelStep` for SageMaker Pipelines Workflows."""
@@ -895,16 +862,6 @@ def __init__(
                     "code argument has to be a valid S3 URI or local file path "
                     + "rather than a pipeline variable"
                 )
-            code_url = urlparse(code)
-            if code_url.scheme == "" or code_url.scheme == "file":
-                # By default, `Processor` will upload the local code to an S3 path
-                # containing a timestamp. This causes cache misses whenever a
-                # pipeline is updated, even if the underlying script hasn't changed.
-                # To avoid this, hash the contents of the script and include it
-                # in the `job_name` passed to the `Processor`, which will be used
-                # instead of the timestamped path.
-                self.job_name = self._generate_code_upload_path()
-
         warnings.warn(
             (
                 'We are deprecating the instantiation of ProcessingStep using "processor".'
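
Note on the removed caching scheme: `_generate_code_upload_path` hashed the local `source_dir` or `entry_point` (plus `dependencies`) with `hash_files_or_dirs` and prefixed the step name, so the resulting `job_name` carried a full 64-character SHA-256 hex digest. The snippet below is only an illustrative sketch of that naming pattern, not the SDK's implementation; the helper name, step name, and `./src` path are hypothetical.

import hashlib
from pathlib import Path


def sketch_hash_files_or_dirs(paths):
    """Illustrative stand-in for sagemaker.workflow.utilities.hash_files_or_dirs:
    fold file names and file contents into a single SHA-256 hex digest."""
    digest = hashlib.sha256()
    for path in sorted(str(p) for p in paths):
        p = Path(path)
        files = sorted(f for f in p.rglob("*") if f.is_file()) if p.is_dir() else [p]
        for f in files:
            digest.update(f.name.encode())
            if f.is_file():
                digest.update(f.read_bytes())
    return digest.hexdigest()


# Rough shape of the removed TrainingStep logic (names are hypothetical):
step_name = "MyTrainingStep"
code_hash = sketch_hash_files_or_dirs(["./src"])  # 64 hex characters
job_name = f"{step_name}-{code_hash}"[:1024]      # 79 characters for this step name
# The digest alone already exceeds SageMaker's 63-character limit on training job
# names, presumably the "long job name" problem the commit title refers to; the fix
# leaves job_name as None so the estimator's default (timestamped) naming applies.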

Diff for: tests/unit/sagemaker/workflow/test_steps.py

+1 -1

@@ -671,7 +671,7 @@ def test_processing_step_normalizes_args_with_local_code(mock_normalize_args, sc
     mock_normalize_args.return_value = [step.inputs, step.outputs]
     step.to_request()
     mock_normalize_args.assert_called_with(
-        job_name="MyProcessingStep-a22fc59b38f13da26f6a40b18687ba598cf669f74104b793cefd9c63eddf4ac7",
+        job_name=None,
         arguments=step.job_arguments,
         inputs=step.inputs,
         outputs=step.outputs,

Diff for: tests/unit/sagemaker/workflow/test_utils.py

+5 -5

@@ -80,11 +80,11 @@ def test_repack_model_step(estimator):
     assert hyperparameters["inference_script"] == '"dummy_script.py"'
     assert hyperparameters["model_archive"] == '"s3://my-bucket/model.tar.gz"'
     assert hyperparameters["sagemaker_program"] == f'"{REPACK_SCRIPT_LAUNCHER}"'
-    assert (
-        hyperparameters["sagemaker_submit_directory"]
-        == '"s3://my-bucket/MyRepackModelStep-717d7bdd388168c27e9ad2938ff0314e35be50b3157cf2498688c7525ea27e1e\
-/source/sourcedir.tar.gz"'
-    )
+
+    # ex: "s3://my-bucket/sagemaker-scikit-learn-2025-04-07-20-39-38-854/source/sourcedir.tar.gz"
+    sagemaker_submit_directory = hyperparameters["sagemaker_submit_directory"]
+    assert sagemaker_submit_directory.startswith('"s3://my-bucket/sagemaker-scikit-learn-')
+    assert sagemaker_submit_directory.endswith('/source/sourcedir.tar.gz"')
 
     del request_dict["Arguments"]["HyperParameters"]
     del request_dict["Arguments"]["AlgorithmSpecification"]["TrainingImage"]
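
Since the submit directory now falls back to the estimator's timestamped default, the updated test can only pin the bucket prefix and the /source/sourcedir.tar.gz suffix. A stricter check could match the timestamp segment explicitly; the snippet below is only a sketch of such an alternative assertion (the sample value mirrors the example in the test comment) and is not part of this commit.

import re

# Hypothetical stricter assertion: match the whole timestamped upload path
# instead of only its prefix and suffix.
sagemaker_submit_directory = (
    '"s3://my-bucket/sagemaker-scikit-learn-2025-04-07-20-39-38-854/source/sourcedir.tar.gz"'
)
timestamped = re.compile(
    r'^"s3://my-bucket/sagemaker-scikit-learn-'
    r'\d{4}(?:-\d{2}){5}-\d{3}'
    r'/source/sourcedir\.tar\.gz"$'
)
assert timestamped.match(sagemaker_submit_directory) is not None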
