
Commit

remove option
pintaoz-aws committed Jan 13, 2025
1 parent 12937f3 commit a58654e
Showing 3 changed files with 9 additions and 140 deletions.
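This commit removes the remove_inputs_and_container_artifacts option from LocalContainer.train and ModelTrainer: a LOCAL_CONTAINER training run now always deletes the input, shared, and per-host folders once the job finishes, and the two "not_remove_input" integration tests go away with the option. A minimal usage sketch of the API after this change (the import paths are assumed from the file layout in this diff; the image URI and source directory are placeholders):

from sagemaker.modules.configs import Compute, SourceCode
from sagemaker.modules.train.model_trainer import Mode, ModelTrainer

model_trainer = ModelTrainer(
    training_image="<cpu-training-image-uri>",  # placeholder image URI
    source_code=SourceCode(
        source_dir="./source",  # placeholder directory containing the entry script
        entry_script="local_training_script.py",
    ),
    compute=Compute(instance_type="local_cpu", instance_count=1),
    training_mode=Mode.LOCAL_CONTAINER,
    # remove_inputs_and_container_artifacts=False  # no longer accepted after this commit
)

# train() now calls local_container.train(wait) with no cleanup flag, so the
# input, shared, and per-host folders under the container root are always
# removed when the local job completes.
model_trainer.train()

Because ModelTrainer's Pydantic config uses extra="forbid", passing the removed keyword now raises a validation error instead of being silently ignored.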
16 changes: 6 additions & 10 deletions src/sagemaker/modules/local_core/local_container.py
@@ -148,15 +148,12 @@ def model_post_init(self, __context: Any):
     def train(
         self,
         wait: bool,
-        remove_inputs_and_container_artifacts: Optional[bool] = True,
     ) -> str:
         """Run a training job locally using docker-compose.

         Args:
             wait (bool):
                 Whether to wait the training output before exiting.
-            remove_inputs_and_container_artifacts (Optional[bool]):
-                Whether to remove inputs and container artifacts after training.
         """
         # create output/data folder since sagemaker-containers 2.0 expects it
         os.makedirs(os.path.join(self.container_root, "output", "data"), exist_ok=True)
@@ -207,13 +204,12 @@ def train(
         # Print our Job Complete line
         logger.info("Local training job completed, output artifacts saved to %s", artifacts)

-        if remove_inputs_and_container_artifacts:
-            shutil.rmtree(os.path.join(self.container_root, "input"))
-            shutil.rmtree(os.path.join(self.container_root, "shared"))
-            for host in self.hosts:
-                shutil.rmtree(os.path.join(self.container_root, host))
-            for folder in self._temporary_folders:
-                shutil.rmtree(os.path.join(self.container_root, folder))
+        shutil.rmtree(os.path.join(self.container_root, "input"))
+        shutil.rmtree(os.path.join(self.container_root, "shared"))
+        for host in self.hosts:
+            shutil.rmtree(os.path.join(self.container_root, host))
+        for folder in self._temporary_folders:
+            shutil.rmtree(os.path.join(self.container_root, folder))
         return artifacts

     def retrieve_artifacts(
5 changes: 1 addition & 4 deletions src/sagemaker/modules/train/model_trainer.py
@@ -203,8 +203,6 @@ class ModelTrainer(BaseModel):
         local_container_root (Optional[str]):
             The local root directory to store artifacts from a training job launched in
             "LOCAL_CONTAINER" mode.
-        remove_inputs_and_container_artifacts (Optional[bool]):
-            Whether to remove inputs and container artifacts after training.
     """

     model_config = ConfigDict(arbitrary_types_allowed=True, extra="forbid")
@@ -229,7 +227,6 @@ class ModelTrainer(BaseModel):
     hyperparameters: Optional[Dict[str, Any]] = {}
     tags: Optional[List[Tag]] = None
     local_container_root: Optional[str] = os.getcwd()
-    remove_inputs_and_container_artifacts: Optional[bool] = True

     # Created Artifacts
     _latest_training_job: Optional[resources.TrainingJob] = PrivateAttr(default=None)
@@ -649,7 +646,7 @@ def train(
                 hyper_parameters=string_hyper_parameters,
                 environment=self.environment,
             )
-            local_container.train(wait, self.remove_inputs_and_container_artifacts)
+            local_container.train(wait)

     def create_input_data_channel(
         self, channel_name: str, data_source: DataSourceType, key_prefix: Optional[str] = None
128 changes: 2 additions & 126 deletions tests/integ/sagemaker/modules/train/test_local_model_trainer.py
@@ -100,7 +100,7 @@ def test_single_container_local_mode_local_data(modules_sagemaker_session):
             delete_local_path(path)


-def test_single_container_local_mode_s3_data_remove_input(modules_sagemaker_session):
+def test_single_container_local_mode_s3_data(modules_sagemaker_session):
     with lock.lock(LOCK_PATH):
         try:
             # upload local data to s3
@@ -163,69 +163,7 @@ def test_single_container_local_mode_s3_data_remove_input(modules_sagemaker_sess
             delete_local_path(path)


-def test_single_container_local_mode_s3_data_not_remove_input(modules_sagemaker_session):
-    with lock.lock(LOCK_PATH):
-        try:
-            # upload local data to s3
-            session = modules_sagemaker_session
-            bucket = session.default_bucket()
-            session.upload_data(
-                path=os.path.join(SOURCE_DIR, "data/train/"),
-                bucket=bucket,
-                key_prefix="data/train",
-            )
-            session.upload_data(
-                path=os.path.join(SOURCE_DIR, "data/test/"),
-                bucket=bucket,
-                key_prefix="data/test",
-            )
-
-            source_code = SourceCode(
-                source_dir=SOURCE_DIR,
-                entry_script="local_training_script.py",
-            )
-
-            compute = Compute(
-                instance_type="local_cpu",
-                instance_count=1,
-            )
-
-            # read input data from s3
-            train_data = InputData(channel_name="train", data_source=f"s3://{bucket}/data/train/")
-
-            test_data = InputData(channel_name="test", data_source=f"s3://{bucket}/data/test/")
-
-            model_trainer = ModelTrainer(
-                training_image=DEFAULT_CPU_IMAGE,
-                sagemaker_session=modules_sagemaker_session,
-                source_code=source_code,
-                compute=compute,
-                input_data_config=[train_data, test_data],
-                base_job_name="local_mode_single_container_s3_data",
-                training_mode=Mode.LOCAL_CONTAINER,
-                remove_inputs_and_container_artifacts=False,
-            )
-
-            model_trainer.train()
-            assert os.path.exists(os.path.join(CWD, "compressed_artifacts/model.tar.gz"))
-        finally:
-            subprocess.run(["docker", "compose", "down", "-v"])
-            directories = [
-                "compressed_artifacts",
-                "artifacts",
-                "model",
-                "shared",
-                "input",
-                "output",
-                "algo-1",
-            ]
-
-            for directory in directories:
-                path = os.path.join(CWD, directory)
-                delete_local_path(path)
-
-
-def test_multi_container_local_mode_remove_input(modules_sagemaker_session):
+def test_multi_container_local_mode(modules_sagemaker_session):
     with lock.lock(LOCK_PATH):
         try:
             source_code = SourceCode(
@@ -284,65 +222,3 @@ def test_multi_container_local_mode_remove_input(modules_sagemaker_session):
             for directory in directories:
                 path = os.path.join(CWD, directory)
                 delete_local_path(path)
-
-
-def test_multi_container_local_mode_not_remove_input(modules_sagemaker_session):
-    with lock.lock(LOCK_PATH):
-        try:
-            source_code = SourceCode(
-                source_dir=SOURCE_DIR,
-                entry_script="local_training_script.py",
-            )
-
-            distributed = Torchrun(
-                process_count_per_node=1,
-            )
-
-            compute = Compute(
-                instance_type="local_cpu",
-                instance_count=2,
-            )
-
-            train_data = InputData(
-                channel_name="train",
-                data_source=os.path.join(SOURCE_DIR, "data/train/"),
-            )
-
-            test_data = InputData(
-                channel_name="test",
-                data_source=os.path.join(SOURCE_DIR, "data/test/"),
-            )
-
-            model_trainer = ModelTrainer(
-                training_image=DEFAULT_CPU_IMAGE,
-                sagemaker_session=modules_sagemaker_session,
-                source_code=source_code,
-                distributed=distributed,
-                compute=compute,
-                input_data_config=[train_data, test_data],
-                base_job_name="local_mode_multi_container",
-                training_mode=Mode.LOCAL_CONTAINER,
-                remove_inputs_and_container_artifacts=False,
-            )
-
-            model_trainer.train()
-            assert os.path.exists(os.path.join(CWD, "compressed_artifacts/model.tar.gz"))
-            assert os.path.exists(os.path.join(CWD, "algo-1"))
-            assert os.path.exists(os.path.join(CWD, "algo-2"))
-
-        finally:
-            subprocess.run(["docker", "compose", "down", "-v"])
-            directories = [
-                "compressed_artifacts",
-                "artifacts",
-                "model",
-                "shared",
-                "input",
-                "output",
-                "algo-1",
-                "algo-2",
-            ]
-
-            for directory in directories:
-                path = os.path.join(CWD, directory)
-                delete_local_path(path)
