From a58654ecbd5997aa59e4fcd8389715fe04c71f5d Mon Sep 17 00:00:00 2001 From: pintaoz-aws Date: Sun, 29 Dec 2024 17:34:08 -0800 Subject: [PATCH] remove option --- .../modules/local_core/local_container.py | 16 +-- src/sagemaker/modules/train/model_trainer.py | 5 +- .../modules/train/test_local_model_trainer.py | 128 +----------------- 3 files changed, 9 insertions(+), 140 deletions(-) diff --git a/src/sagemaker/modules/local_core/local_container.py b/src/sagemaker/modules/local_core/local_container.py index 3cfb52e626..448330092d 100644 --- a/src/sagemaker/modules/local_core/local_container.py +++ b/src/sagemaker/modules/local_core/local_container.py @@ -148,15 +148,12 @@ def model_post_init(self, __context: Any): def train( self, wait: bool, - remove_inputs_and_container_artifacts: Optional[bool] = True, ) -> str: """Run a training job locally using docker-compose. Args: wait (bool): Whether to wait the training output before exiting. - remove_inputs_and_container_artifacts (Optional[bool]): - Whether to remove inputs and container artifacts after training. """ # create output/data folder since sagemaker-containers 2.0 expects it os.makedirs(os.path.join(self.container_root, "output", "data"), exist_ok=True) @@ -207,13 +204,12 @@ def train( # Print our Job Complete line logger.info("Local training job completed, output artifacts saved to %s", artifacts) - if remove_inputs_and_container_artifacts: - shutil.rmtree(os.path.join(self.container_root, "input")) - shutil.rmtree(os.path.join(self.container_root, "shared")) - for host in self.hosts: - shutil.rmtree(os.path.join(self.container_root, host)) - for folder in self._temporary_folders: - shutil.rmtree(os.path.join(self.container_root, folder)) + shutil.rmtree(os.path.join(self.container_root, "input")) + shutil.rmtree(os.path.join(self.container_root, "shared")) + for host in self.hosts: + shutil.rmtree(os.path.join(self.container_root, host)) + for folder in self._temporary_folders: + shutil.rmtree(os.path.join(self.container_root, folder)) return artifacts def retrieve_artifacts( diff --git a/src/sagemaker/modules/train/model_trainer.py b/src/sagemaker/modules/train/model_trainer.py index b603fafd3f..31decfaca9 100644 --- a/src/sagemaker/modules/train/model_trainer.py +++ b/src/sagemaker/modules/train/model_trainer.py @@ -203,8 +203,6 @@ class ModelTrainer(BaseModel): local_container_root (Optional[str]): The local root directory to store artifacts from a training job launched in "LOCAL_CONTAINER" mode. - remove_inputs_and_container_artifacts (Optional[bool]): - Whether to remove inputs and container artifacts after training. """ model_config = ConfigDict(arbitrary_types_allowed=True, extra="forbid") @@ -229,7 +227,6 @@ class ModelTrainer(BaseModel): hyperparameters: Optional[Dict[str, Any]] = {} tags: Optional[List[Tag]] = None local_container_root: Optional[str] = os.getcwd() - remove_inputs_and_container_artifacts: Optional[bool] = True # Created Artifacts _latest_training_job: Optional[resources.TrainingJob] = PrivateAttr(default=None) @@ -649,7 +646,7 @@ def train( hyper_parameters=string_hyper_parameters, environment=self.environment, ) - local_container.train(wait, self.remove_inputs_and_container_artifacts) + local_container.train(wait) def create_input_data_channel( self, channel_name: str, data_source: DataSourceType, key_prefix: Optional[str] = None diff --git a/tests/integ/sagemaker/modules/train/test_local_model_trainer.py b/tests/integ/sagemaker/modules/train/test_local_model_trainer.py index 9d8fcd8889..7947b2fc87 100644 --- a/tests/integ/sagemaker/modules/train/test_local_model_trainer.py +++ b/tests/integ/sagemaker/modules/train/test_local_model_trainer.py @@ -100,7 +100,7 @@ def test_single_container_local_mode_local_data(modules_sagemaker_session): delete_local_path(path) -def test_single_container_local_mode_s3_data_remove_input(modules_sagemaker_session): +def test_single_container_local_mode_s3_data(modules_sagemaker_session): with lock.lock(LOCK_PATH): try: # upload local data to s3 @@ -163,69 +163,7 @@ def test_single_container_local_mode_s3_data_remove_input(modules_sagemaker_sess delete_local_path(path) -def test_single_container_local_mode_s3_data_not_remove_input(modules_sagemaker_session): - with lock.lock(LOCK_PATH): - try: - # upload local data to s3 - session = modules_sagemaker_session - bucket = session.default_bucket() - session.upload_data( - path=os.path.join(SOURCE_DIR, "data/train/"), - bucket=bucket, - key_prefix="data/train", - ) - session.upload_data( - path=os.path.join(SOURCE_DIR, "data/test/"), - bucket=bucket, - key_prefix="data/test", - ) - - source_code = SourceCode( - source_dir=SOURCE_DIR, - entry_script="local_training_script.py", - ) - - compute = Compute( - instance_type="local_cpu", - instance_count=1, - ) - - # read input data from s3 - train_data = InputData(channel_name="train", data_source=f"s3://{bucket}/data/train/") - - test_data = InputData(channel_name="test", data_source=f"s3://{bucket}/data/test/") - - model_trainer = ModelTrainer( - training_image=DEFAULT_CPU_IMAGE, - sagemaker_session=modules_sagemaker_session, - source_code=source_code, - compute=compute, - input_data_config=[train_data, test_data], - base_job_name="local_mode_single_container_s3_data", - training_mode=Mode.LOCAL_CONTAINER, - remove_inputs_and_container_artifacts=False, - ) - - model_trainer.train() - assert os.path.exists(os.path.join(CWD, "compressed_artifacts/model.tar.gz")) - finally: - subprocess.run(["docker", "compose", "down", "-v"]) - directories = [ - "compressed_artifacts", - "artifacts", - "model", - "shared", - "input", - "output", - "algo-1", - ] - - for directory in directories: - path = os.path.join(CWD, directory) - delete_local_path(path) - - -def test_multi_container_local_mode_remove_input(modules_sagemaker_session): +def test_multi_container_local_mode(modules_sagemaker_session): with lock.lock(LOCK_PATH): try: source_code = SourceCode( @@ -284,65 +222,3 @@ def test_multi_container_local_mode_remove_input(modules_sagemaker_session): for directory in directories: path = os.path.join(CWD, directory) delete_local_path(path) - - -def test_multi_container_local_mode_not_remove_input(modules_sagemaker_session): - with lock.lock(LOCK_PATH): - try: - source_code = SourceCode( - source_dir=SOURCE_DIR, - entry_script="local_training_script.py", - ) - - distributed = Torchrun( - process_count_per_node=1, - ) - - compute = Compute( - instance_type="local_cpu", - instance_count=2, - ) - - train_data = InputData( - channel_name="train", - data_source=os.path.join(SOURCE_DIR, "data/train/"), - ) - - test_data = InputData( - channel_name="test", - data_source=os.path.join(SOURCE_DIR, "data/test/"), - ) - - model_trainer = ModelTrainer( - training_image=DEFAULT_CPU_IMAGE, - sagemaker_session=modules_sagemaker_session, - source_code=source_code, - distributed=distributed, - compute=compute, - input_data_config=[train_data, test_data], - base_job_name="local_mode_multi_container", - training_mode=Mode.LOCAL_CONTAINER, - remove_inputs_and_container_artifacts=False, - ) - - model_trainer.train() - assert os.path.exists(os.path.join(CWD, "compressed_artifacts/model.tar.gz")) - assert os.path.exists(os.path.join(CWD, "algo-1")) - assert os.path.exists(os.path.join(CWD, "algo-2")) - - finally: - subprocess.run(["docker", "compose", "down", "-v"]) - directories = [ - "compressed_artifacts", - "artifacts", - "model", - "shared", - "input", - "output", - "algo-1", - "algo-2", - ] - - for directory in directories: - path = os.path.join(CWD, directory) - delete_local_path(path)