Skip to content

Commit a58654e

Browse files
committed
remove option
1 parent 12937f3 commit a58654e

File tree

3 files changed

+9
-140
lines changed

3 files changed

+9
-140
lines changed

src/sagemaker/modules/local_core/local_container.py

+6-10
Original file line numberDiff line numberDiff line change
@@ -148,15 +148,12 @@ def model_post_init(self, __context: Any):
148148
def train(
149149
self,
150150
wait: bool,
151-
remove_inputs_and_container_artifacts: Optional[bool] = True,
152151
) -> str:
153152
"""Run a training job locally using docker-compose.
154153
155154
Args:
156155
wait (bool):
157156
Whether to wait for the training output before exiting.
158-
remove_inputs_and_container_artifacts (Optional[bool]):
159-
Whether to remove inputs and container artifacts after training.
160157
"""
161158
# create output/data folder since sagemaker-containers 2.0 expects it
162159
os.makedirs(os.path.join(self.container_root, "output", "data"), exist_ok=True)
@@ -207,13 +204,12 @@ def train(
207204
# Print our Job Complete line
208205
logger.info("Local training job completed, output artifacts saved to %s", artifacts)
209206

210-
if remove_inputs_and_container_artifacts:
211-
shutil.rmtree(os.path.join(self.container_root, "input"))
212-
shutil.rmtree(os.path.join(self.container_root, "shared"))
213-
for host in self.hosts:
214-
shutil.rmtree(os.path.join(self.container_root, host))
215-
for folder in self._temporary_folders:
216-
shutil.rmtree(os.path.join(self.container_root, folder))
207+
shutil.rmtree(os.path.join(self.container_root, "input"))
208+
shutil.rmtree(os.path.join(self.container_root, "shared"))
209+
for host in self.hosts:
210+
shutil.rmtree(os.path.join(self.container_root, host))
211+
for folder in self._temporary_folders:
212+
shutil.rmtree(os.path.join(self.container_root, folder))
217213
return artifacts
218214

219215
def retrieve_artifacts(

src/sagemaker/modules/train/model_trainer.py

+1-4
Original file line numberDiff line numberDiff line change
@@ -203,8 +203,6 @@ class ModelTrainer(BaseModel):
203203
local_container_root (Optional[str]):
204204
The local root directory to store artifacts from a training job launched in
205205
"LOCAL_CONTAINER" mode.
206-
remove_inputs_and_container_artifacts (Optional[bool]):
207-
Whether to remove inputs and container artifacts after training.
208206
"""
209207

210208
model_config = ConfigDict(arbitrary_types_allowed=True, extra="forbid")
@@ -229,7 +227,6 @@ class ModelTrainer(BaseModel):
229227
hyperparameters: Optional[Dict[str, Any]] = {}
230228
tags: Optional[List[Tag]] = None
231229
local_container_root: Optional[str] = os.getcwd()
232-
remove_inputs_and_container_artifacts: Optional[bool] = True
233230

234231
# Created Artifacts
235232
_latest_training_job: Optional[resources.TrainingJob] = PrivateAttr(default=None)
@@ -649,7 +646,7 @@ def train(
649646
hyper_parameters=string_hyper_parameters,
650647
environment=self.environment,
651648
)
652-
local_container.train(wait, self.remove_inputs_and_container_artifacts)
649+
local_container.train(wait)
653650

654651
def create_input_data_channel(
655652
self, channel_name: str, data_source: DataSourceType, key_prefix: Optional[str] = None

tests/integ/sagemaker/modules/train/test_local_model_trainer.py

+2-126
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ def test_single_container_local_mode_local_data(modules_sagemaker_session):
100100
delete_local_path(path)
101101

102102

103-
def test_single_container_local_mode_s3_data_remove_input(modules_sagemaker_session):
103+
def test_single_container_local_mode_s3_data(modules_sagemaker_session):
104104
with lock.lock(LOCK_PATH):
105105
try:
106106
# upload local data to s3
@@ -163,69 +163,7 @@ def test_single_container_local_mode_s3_data_remove_input(modules_sagemaker_sess
163163
delete_local_path(path)
164164

165165

166-
def test_single_container_local_mode_s3_data_not_remove_input(modules_sagemaker_session):
167-
with lock.lock(LOCK_PATH):
168-
try:
169-
# upload local data to s3
170-
session = modules_sagemaker_session
171-
bucket = session.default_bucket()
172-
session.upload_data(
173-
path=os.path.join(SOURCE_DIR, "data/train/"),
174-
bucket=bucket,
175-
key_prefix="data/train",
176-
)
177-
session.upload_data(
178-
path=os.path.join(SOURCE_DIR, "data/test/"),
179-
bucket=bucket,
180-
key_prefix="data/test",
181-
)
182-
183-
source_code = SourceCode(
184-
source_dir=SOURCE_DIR,
185-
entry_script="local_training_script.py",
186-
)
187-
188-
compute = Compute(
189-
instance_type="local_cpu",
190-
instance_count=1,
191-
)
192-
193-
# read input data from s3
194-
train_data = InputData(channel_name="train", data_source=f"s3://{bucket}/data/train/")
195-
196-
test_data = InputData(channel_name="test", data_source=f"s3://{bucket}/data/test/")
197-
198-
model_trainer = ModelTrainer(
199-
training_image=DEFAULT_CPU_IMAGE,
200-
sagemaker_session=modules_sagemaker_session,
201-
source_code=source_code,
202-
compute=compute,
203-
input_data_config=[train_data, test_data],
204-
base_job_name="local_mode_single_container_s3_data",
205-
training_mode=Mode.LOCAL_CONTAINER,
206-
remove_inputs_and_container_artifacts=False,
207-
)
208-
209-
model_trainer.train()
210-
assert os.path.exists(os.path.join(CWD, "compressed_artifacts/model.tar.gz"))
211-
finally:
212-
subprocess.run(["docker", "compose", "down", "-v"])
213-
directories = [
214-
"compressed_artifacts",
215-
"artifacts",
216-
"model",
217-
"shared",
218-
"input",
219-
"output",
220-
"algo-1",
221-
]
222-
223-
for directory in directories:
224-
path = os.path.join(CWD, directory)
225-
delete_local_path(path)
226-
227-
228-
def test_multi_container_local_mode_remove_input(modules_sagemaker_session):
166+
def test_multi_container_local_mode(modules_sagemaker_session):
229167
with lock.lock(LOCK_PATH):
230168
try:
231169
source_code = SourceCode(
@@ -284,65 +222,3 @@ def test_multi_container_local_mode_remove_input(modules_sagemaker_session):
284222
for directory in directories:
285223
path = os.path.join(CWD, directory)
286224
delete_local_path(path)
287-
288-
289-
def test_multi_container_local_mode_not_remove_input(modules_sagemaker_session):
290-
with lock.lock(LOCK_PATH):
291-
try:
292-
source_code = SourceCode(
293-
source_dir=SOURCE_DIR,
294-
entry_script="local_training_script.py",
295-
)
296-
297-
distributed = Torchrun(
298-
process_count_per_node=1,
299-
)
300-
301-
compute = Compute(
302-
instance_type="local_cpu",
303-
instance_count=2,
304-
)
305-
306-
train_data = InputData(
307-
channel_name="train",
308-
data_source=os.path.join(SOURCE_DIR, "data/train/"),
309-
)
310-
311-
test_data = InputData(
312-
channel_name="test",
313-
data_source=os.path.join(SOURCE_DIR, "data/test/"),
314-
)
315-
316-
model_trainer = ModelTrainer(
317-
training_image=DEFAULT_CPU_IMAGE,
318-
sagemaker_session=modules_sagemaker_session,
319-
source_code=source_code,
320-
distributed=distributed,
321-
compute=compute,
322-
input_data_config=[train_data, test_data],
323-
base_job_name="local_mode_multi_container",
324-
training_mode=Mode.LOCAL_CONTAINER,
325-
remove_inputs_and_container_artifacts=False,
326-
)
327-
328-
model_trainer.train()
329-
assert os.path.exists(os.path.join(CWD, "compressed_artifacts/model.tar.gz"))
330-
assert os.path.exists(os.path.join(CWD, "algo-1"))
331-
assert os.path.exists(os.path.join(CWD, "algo-2"))
332-
333-
finally:
334-
subprocess.run(["docker", "compose", "down", "-v"])
335-
directories = [
336-
"compressed_artifacts",
337-
"artifacts",
338-
"model",
339-
"shared",
340-
"input",
341-
"output",
342-
"algo-1",
343-
"algo-2",
344-
]
345-
346-
for directory in directories:
347-
path = os.path.join(CWD, directory)
348-
delete_local_path(path)

0 commit comments

Comments
 (0)