From 63464edc33680edf9327605e93b638540ca9f2b2 Mon Sep 17 00:00:00 2001 From: Xiaochang Wu Date: Sun, 4 Feb 2024 11:27:35 +0800 Subject: [PATCH] Revert "some changes to support fine-tuning on Intel GPU (#88)" (#95) This reverts commit a555e0c1b80da8fce1a2e0cb6d7ad2cf3c42a281. --- .github/workflows/workflow_finetune.yml | 4 +- .github/workflows/workflow_inference.yml | 4 +- .../model/huggingface_model_for_causal_lm.py | 13 +---- common/trainer/default_trainer.py | 23 ++------ common/trainer/rm_trainer.py | 4 +- docs/finetune_parameters.md | 3 -- finetune/finetune.py | 53 ++++--------------- finetune/finetune.yaml | 4 -- finetune/finetune_config.py | 9 ---- pyproject.toml | 12 ++--- 10 files changed, 27 insertions(+), 102 deletions(-) diff --git a/.github/workflows/workflow_finetune.yml b/.github/workflows/workflow_finetune.yml index 5a8e32720..181f46df4 100644 --- a/.github/workflows/workflow_finetune.yml +++ b/.github/workflows/workflow_finetune.yml @@ -11,10 +11,10 @@ on: default: '10.1.2.13:5000/llmray-build' http_proxy: type: string - default: 'http://10.24.221.149:911' + default: 'http://proxy-chain.intel.com:911' https_proxy: type: string - default: 'http://10.24.221.149:911' + default: 'http://proxy-chain.intel.com:911' runner_config_path: type: string default: '/home/ci/llm-ray-actions-runner' diff --git a/.github/workflows/workflow_inference.yml b/.github/workflows/workflow_inference.yml index 03269a4a4..8b3ac1d56 100644 --- a/.github/workflows/workflow_inference.yml +++ b/.github/workflows/workflow_inference.yml @@ -11,10 +11,10 @@ on: default: '10.1.2.13:5000/llmray-build' http_proxy: type: string - default: 'http://10.24.221.149:911' + default: 'http://proxy-chain.intel.com:911' https_proxy: type: string - default: 'http://10.24.221.149:911' + default: 'http://proxy-chain.intel.com:911' runner_config_path: type: string default: '/home/ci/llm-ray-actions-runner' diff --git a/common/model/huggingface_model_for_causal_lm.py b/common/model/huggingface_model_for_causal_lm.py index cc2ce6234..30ad5a809 100644 --- a/common/model/huggingface_model_for_causal_lm.py +++ b/common/model/huggingface_model_for_causal_lm.py @@ -8,12 +8,8 @@ class HuggingFaceModelForCausalLM(Model): def __call__(self, config): name = config.get("name") - model_dtype = config.get("dtype") model_config = config.get("config", {}) - model = transformers.AutoModelForCausalLM.from_pretrained( - name, torch_dtype=model_dtype, **model_config - ) - + model = transformers.AutoModelForCausalLM.from_pretrained(name, **model_config) lora_config = config.get("lora_config", None) if lora_config: peft_config = LoraConfig(**lora_config) @@ -21,11 +17,4 @@ def __call__(self, config): deltatuner_config = config.get("deltatuner_config", None) if deltatuner_config: model = deltatuner.optimize(model, **deltatuner_config) - - enable_gradient_checkpointing = config.get("enable_gradient_checkpointing") - if enable_gradient_checkpointing: - model.enable_input_require_grads() - model.gradient_checkpointing_enable() - model.config.use_cache = False - return model diff --git a/common/trainer/default_trainer.py b/common/trainer/default_trainer.py index f05c6317f..a33ac2bdd 100644 --- a/common/trainer/default_trainer.py +++ b/common/trainer/default_trainer.py @@ -85,6 +85,8 @@ def _get_lr_scheduler( num_steps_per_epoch, accelerator, ): + # gradient_accumulation_steps = accelerator.gradient_accumulation_steps + # num_update_steps_per_epoch = math.ceil(num_steps_per_epoch / gradient_accumulation_steps) enable = lr_scheduler_config.get("enable", False) if not enable: return None @@ -151,7 +153,7 @@ def prepare(self, model, tokenizer, dataset, optimizer, accelerator): def train(self): num_train_epochs = self.config.get("num_train_epochs", 1) checkpoint = self.config.get("checkpoint") - logging_steps = self.config.get("logging_steps", 1) + log_step = self.config.get("log_step", 1) max_train_step = self.config.get("max_train_step") max_eval_step = self.config.get("max_eval_step") for idx in range(self.starting_epoch, num_train_epochs, 1): @@ -168,17 +170,12 @@ def train(self): if self.lr_scheduler is not None: self.lr_scheduler.step() self.optimizer.zero_grad() - - if step % logging_steps == 0: - loss = loss.item() - ppl = math.exp(loss) + if step % log_step == 0: logger.info( - f"train epoch:[{idx}/{num_train_epochs}]\tstep:[{step}/{total_steps}]\tloss:{loss:.6f}\tppl:{ppl:.6f}\ttime:{time.time()-start:.6f}" + f"train epoch:[{idx}/{num_train_epochs}]\tstep:[{step}/{total_steps}]\tloss:{loss:.6f}\tppl:{math.exp(loss):.6f}\ttime:{time.time()-start:.6f}" ) report( { - "loss": loss, - "ppl": ppl, "train_epoch": idx, "total_epochs": num_train_epochs, "train_step": step, @@ -187,10 +184,6 @@ def train(self): else total_steps, } ) - self.accelerator.log( - {"train loss": loss, "train perplexity": ppl}, - step=idx * total_steps + step, - ) start = time.time() if max_train_step is not None: if step >= max_train_step - 1: @@ -221,9 +214,6 @@ def train(self): except OverflowError: eval_loss = float("inf") perplexity = float("inf") - self.accelerator.log( - {"evaluate loss": eval_loss, "evaluate perplexity": perplexity} - ) logger.info( f"eval epoch:[{idx}/{num_train_epochs}]\tloss:[{eval_loss:.6f}]\tppl:[{perplexity:.6f}]\ttime:[{time.time()-start:.6f}]" ) @@ -242,9 +232,6 @@ def train(self): save_function=self.accelerator.save, ) logger.info(f"finish save model to {output}") - - self.accelerator.end_training() - self.accelerator.wait_for_everyone() def _get_local_path(self, root_path, model_name): diff --git a/common/trainer/rm_trainer.py b/common/trainer/rm_trainer.py index 1cc64d93e..0ee8ee7eb 100644 --- a/common/trainer/rm_trainer.py +++ b/common/trainer/rm_trainer.py @@ -52,7 +52,7 @@ def compute_loss(self, batch, return_outputs=False): def train(self): num_train_epochs = self.config.get("num_train_epochs", 1) - logging_steps = self.config.get("logging_steps", 1) + log_step = self.config.get("log_step", 1) if not os.path.exists(self.config.get("log_path", ".")): os.makedirs(self.config.get("log_path", "."), exist_ok=True) writer = SummaryWriter(self.config.get("log_path", ".")) @@ -69,7 +69,7 @@ def train(self): if self.lr_scheduler is not None: self.lr_scheduler.step() self.optimizer.zero_grad() - if step % logging_steps == 0: + if step % log_step == 0: logger.info( f"train epoch:[{idx}/{num_train_epochs}]\tstep:[{step}/{len(self.train_dataloader)}]\tloss:{loss}\tppl:{math.exp(loss)}\ttime:{time.time()-start}" ) diff --git a/docs/finetune_parameters.md b/docs/finetune_parameters.md index a10bb8c33..39cd1239e 100644 --- a/docs/finetune_parameters.md +++ b/docs/finetune_parameters.md @@ -10,11 +10,9 @@ The following are the parameters supported in the finetuning workflow. |gpt_base_model|True|This parameter is for [Transformers#22482](https://github.com/huggingface/transformers/issues/22482). It needs to be set to True when the pretrained model is realted to gpt, otherwise it is False.| |output_dir|/tmp/llm-ray/output|The output directory to store the finetuned model| |checkpoint_dir|/tmp/llm-ray/checkpoint|The directory to store checkpoint| -|tracking_dir|/tmp/llm-ray/tracking|The path to a directory for storing logs of locally-compatible loggers| |config|trust_remote_code: False
use_auth_token: None|Will be passed to the transformers `from_pretrained()` method| |lora_config|task_type: CAUSAL_LM
r: 8
lora_alpha: 32
lora_dropout: 0.1|Will be passed to the LoraConfig `__init__()` method, then it'll be used as config to build Peft model object.| |deltatuner_config|"algo": "lora"
"denas": True
"best_model_structure": "/path/to/best_structure_of_deltatuner_model"|Will be passed to the DeltaTunerArguments `__init__()` method, then it'll be used as config to build [Deltatuner model](https://github.com/intel/e2eAIOK/tree/main/e2eAIOK/deltatuner) object.| -|enable_gradient_checkpointing|False|enable gradient checkpointing to save GPU memory, but will cost more compute runtime| ## Dataset Parameters @@ -42,4 +40,3 @@ The following are the parameters supported in the finetuning workflow. |max_train_steps|None|Total number of training steps to perform. If provided, overrides epochs.| |gradient_accumulation_steps|1|Number of updates steps to accumulate before performing a backward/update pass.| |seed|None|A seed for reproducible training.| -|logging_steps|10|logging per steps| diff --git a/finetune/finetune.py b/finetune/finetune.py index 7ab0183db..0815dabfe 100644 --- a/finetune/finetune.py +++ b/finetune/finetune.py @@ -4,7 +4,6 @@ import argparse from typing import Any, Dict, Union -import torch import accelerate from accelerate.utils import is_xpu_available @@ -63,14 +62,6 @@ def get_accelerate_environment_variable(mode: str, config: Union[Dict[str, Any], return mode_env_vars[mode] -def convert_dtype(dtype: str) -> torch.dtype: - supported_dtypes = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32} - if dtype in supported_dtypes: - return supported_dtypes[dtype] - else: - raise ValueError(f"only supported torch.dtype list [{supported_dtypes.keys()}]") - - def train_func(config: Dict[str, Any]): cwd = config.get("cwd") if cwd: @@ -88,26 +79,9 @@ def train_func(config: Dict[str, Any]): ) else: fsdp_plugin = None - - log_with = "tensorboard" # only support tensorboard as tracker - output_dir = config["General"]["output_dir"] - tracking_dir = config["General"]["tracking_dir"] accelerator = accelerate.Accelerator( - gradient_accumulation_steps=gradient_accumulation_steps, - fsdp_plugin=fsdp_plugin, - log_with=log_with, - project_dir=tracking_dir, + gradient_accumulation_steps=gradient_accumulation_steps, fsdp_plugin=fsdp_plugin ) - epochs = config["Training"]["epochs"] - tracker_config = { - "epochs": epochs, - "learning_rate": config["Training"]["learning_rate"], - "batch_size": config["Training"]["batch_size"], - } - base_model = config["General"]["base_model"] - dataset_file = config["Dataset"]["train_file"] - accelerator.init_trackers("fine-tuning", config=tracker_config) - common.logger.info( f"accelerator generate finish, accelerator device type = {accelerator.device}" ) @@ -118,7 +92,7 @@ def train_func(config: Dict[str, Any]): datasets = common.dataset.Dataset.registory.get("HuggingfaceDataset")()( config={ - "name": dataset_file, + "name": config["Dataset"]["train_file"], "validation_file": config["Dataset"]["validation_file"], "validation_split_percentage": config["Dataset"]["validation_split_percentage"], } @@ -126,17 +100,15 @@ def train_func(config: Dict[str, Any]): tokenizer = common.tokenizer.Tokenizer.registory.get("HuggingFaceTokenizer")()( config={ - "name": base_model, + "name": config["General"]["base_model"], "config": config["General"]["config"], } ) model = common.model.Model.registory.get("HuggingFaceModelForCausalLM")()( config={ - "name": base_model, - "dtype": convert_dtype(config["Training"]["mixed_precision"]), + "name": config["General"]["base_model"], "config": config["General"]["config"], - "enable_gradient_checkpointing": config["General"]["enable_gradient_checkpointing"], "lora_config": config["General"]["lora_config"] if config["General"].get("lora_config") else None, @@ -153,10 +125,10 @@ def train_func(config: Dict[str, Any]): trainer = common.trainer.Trainer.registory.get("DefaultTrainer")( config={ - "num_train_epochs": epochs, + "num_train_epochs": config["Training"]["epochs"], "max_train_step": config["Training"].get("max_train_steps", None), - "logging_steps": config["Training"].get("logging_steps", 1), - "output": output_dir, + "log_step": 1, + "output": config["General"]["output_dir"], "dataprocesser": { "type": "GeneralProcesser", "per_device_train_batch_size": config["Training"]["batch_size"], @@ -245,21 +217,14 @@ def main(external_config=None): "FI_PROVIDER": "tcp", } } + accelerate_env_vars = get_accelerate_environment_variable(accelerate_mode, config) runtime_env["env_vars"].update(accelerate_env_vars) if config["General"]["gpt_base_model"] is True: runtime_env["pip"] = ["transformers==4.26.0"] - import intel_extension_for_pytorch as ipex - - if "xpu" in ipex.__version__: - num_cpus = ( - resources_per_worker["CPU"] * num_training_workers + 1 - ) # additional 1 for head worker - ray.init(num_cpus=num_cpus, runtime_env=runtime_env) - else: - ray.init(runtime_env=runtime_env) + ray.init(runtime_env=runtime_env) common.logger.info(f"ray available resources = {ray.available_resources()}") diff --git a/finetune/finetune.yaml b/finetune/finetune.yaml index 285520d82..f0092022d 100644 --- a/finetune/finetune.yaml +++ b/finetune/finetune.yaml @@ -3,7 +3,6 @@ General: gpt_base_model: true output_dir: /tmp/llm-ray/output checkpoint_dir: /tmp/llm-ray/checkpoint - tracking_dir: /tmp/llm-ray/tracking config: trust_remote_code: false use_auth_token: null @@ -12,7 +11,6 @@ General: r: 8 lora_alpha: 32 lora_dropout: 0.1 - enable_gradient_checkpointing: false Dataset: train_file: examples/data/sample_finetune_data_small.jsonl validation_file: null @@ -30,5 +28,3 @@ Training: resources_per_worker: CPU: 32 accelerate_mode: CPU_DDP - gradient_accumulation_steps: 2 - logging_steps: 10 diff --git a/finetune/finetune_config.py b/finetune/finetune_config.py index 6a269b7ee..fc4fe3872 100644 --- a/finetune/finetune_config.py +++ b/finetune/finetune_config.py @@ -26,11 +26,9 @@ class General(BaseModel): gpt_base_model: bool output_dir: str checkpoint_dir: str - tracking_dir: str config: GeneralConfig lora_config: Optional[LoraConfig] = None deltatuner_config: Optional[DeltatunerConfig] = None - enable_gradient_checkpointing: bool = False class Dataset(BaseModel): @@ -56,8 +54,6 @@ class Training(BaseModel): resources_per_worker: RayResourceConfig accelerate_mode: str mixed_precision: str = "no" - gradient_accumulation_steps: int - logging_steps: int = 10 @validator("device") def check_device(cls, v: str): @@ -73,11 +69,6 @@ def check_accelerate_mode(cls, v: str): raise ValueError(f"accelerate_mode must be one of {modes}") return v - @validator("logging_steps") - def check_logging_steps(cls, v: int): - assert v > 0 - return v - # @model_validator(mode='after') # def check_device_and_accelerate_mode(self) -> "Training": # dev = self.device diff --git a/pyproject.toml b/pyproject.toml index 70f87b2f4..8535c9ed2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ dependencies = [ "peft>=0.4.0", "deltatuner==1.1.9", "py-cpuinfo", - "pydantic-yaml" + "pydantic-yaml", ] [project.optional-dependencies] @@ -48,11 +48,11 @@ cpu = [ gpu = [ "transformers>=4.35.0", - "torch==2.1.0a0", - "torchvision==0.16.0a0", - "intel_extension_for_pytorch==2.1.10+xpu", - "oneccl_bind_pt==2.1.100+xpu", - "dpctl==0.15.0" + "torch==2.0.1a0", + "torchvision==0.15.2a0", + "intel-extension-for-pytorch==2.0.110+xpu", + "oneccl_bind_pt==2.0.100+gpu", + "dpctl==0.14.5" ] deepspeed = [