Commit 7e56af6

remove nightly ray, and use newest release version
harborn committed Dec 27, 2023
1 parent: 91df9e9
Showing 3 changed files with 16 additions and 10 deletions.
common/trainer/default_trainer.py (6 changes: 4 additions & 2 deletions)
@@ -57,8 +57,10 @@ def recovery(self, config):
             self.starting_epoch = checkpoint_epoch["epoch"] + 1
 
             logger.info(f"recovery to epoch {self.starting_epoch}")
         except FileNotFoundError as e:
-            logger.warning(f"recovery error", exc_info=True)
+            logger.info(e)
+        except Exception as e:
+            logger.warning("recovery error", exc_info=True)
 
     def _coordinate(self, accelerator):
         self.accelerator = accelerator
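
The reworked handler separates the expected first-run case (no checkpoint on disk yet) from genuine failures. A minimal, self-contained sketch of the resulting pattern; the JSON checkpoint layout and path here are illustrative assumptions, not the repository's actual format:

    import json
    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    def recovery(state, checkpoint_path):
        try:
            with open(checkpoint_path) as f:  # assumed checkpoint layout
                checkpoint_epoch = json.load(f)
            state["starting_epoch"] = checkpoint_epoch["epoch"] + 1
            logger.info(f"recovery to epoch {state['starting_epoch']}")
        except FileNotFoundError as e:
            # No checkpoint yet (e.g. a fresh run): informational, not an error.
            logger.info(e)
        except Exception:
            # Anything else is unexpected: keep the full traceback.
            logger.warning("recovery error", exc_info=True)
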
@@ -174,7 +176,7 @@ def train(self):
             except OverflowError:
                 eval_loss = float("inf")
                 perplexity = float("inf")
-            logger.info(f"eval epoch:[{idx}/{num_train_epochs}]\tloss:[{eval_loss}]\tppl:[{perplexity}]\ttime:[{time.time()-start}]")
+            logger.info(f"eval epoch:[{idx}/{num_train_epochs}]\tloss:[{eval_loss:.6f}]\tppl:[{perplexity:.6f}]\ttime:[{time.time()-start:.6f}]")
 
             if checkpoint is not None:
                 self.save(checkpoint, idx)
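
The only change here is the `:.6f` format spec, which fixes each metric to six decimal places so the tab-separated columns stay uniform; the infinities from the OverflowError fallback still format safely as a plain `inf`. A quick illustration:

    eval_loss = 2.0
    perplexity = 7.38905609893065
    print(f"loss:[{eval_loss:.6f}]\tppl:[{perplexity:.6f}]")
    # -> loss:[2.000000]    ppl:[7.389056]   (tab-separated)
    print(f"ppl:[{float('inf'):.6f}]")
    # -> ppl:[inf]
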
finetune/finetune.py (8 changes: 6 additions & 2 deletions)
@@ -108,6 +108,7 @@ def train_func(config: Dict[str, Any]):
     trainer = common.trainer.Trainer.registory.get("DefaultTrainer")(config = {
         "num_train_epochs": config["Training"]["epochs"],
         "max_train_step": config["Training"].get("max_train_steps", None),
+        "log_step": 1,
         "output": config["General"]["output_dir"],
         "dataprocesser": {
             "type": "GeneralProcesser",
@@ -159,7 +160,7 @@ def get_finetune_config():

     with open(config_file) as f:
         finetune_config = parse_yaml_raw_as(FinetuneConfig, f)
-    return finetune_config.dict()
+    return finetune_config.model_dump()
 
 
 def main(external_config = None):
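
`parse_yaml_raw_as` returns a Pydantic model, and Pydantic v2 renamed the v1 `.dict()` serializer to `.model_dump()` (on v2, `.dict()` still works but emits a deprecation warning). A toy model in place of the real `FinetuneConfig` shows the equivalence:

    from pydantic import BaseModel

    class ToyConfig(BaseModel):
        epochs: int = 3

    cfg = ToyConfig()
    print(cfg.model_dump())  # {'epochs': 3}; the v2 spelling of cfg.dict()
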
@@ -177,6 +178,7 @@ def main(external_config = None):

     use_cpu = True if accelerate_mode.startswith("CPU") else False
     use_gpu = True if accelerate_mode.startswith("GPU") else False
+    num_cpus = num_training_workers * resources_per_worker["CPU"]
     ccl_worker_count = 1 if use_cpu is True else num_training_workers
 
     if not ray.is_initialized():
@@ -198,7 +200,9 @@
         if config["General"]["gpt_base_model"] == True:
             runtime_env["pip"] = ["transformers==4.26.0"]
 
-        ray.init(runtime_env = runtime_env)
+        ray.init(num_cpus=num_cpus + 1, runtime_env=runtime_env)  # head worker needs 1 CPU
+
+        common.logger.info(f"ray available resources = {ray.available_resources()}")
 
     scaling_config = ScalingConfig(
         num_workers = num_training_workers,
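
Together with the `num_cpus` computed earlier, this sizes the local Ray cluster explicitly: enough CPUs for all training workers, plus one reserved for the head/driver process. A sketch of the arithmetic with assumed example values:

    import ray

    num_training_workers = 2
    resources_per_worker = {"CPU": 56}    # assumed example values
    num_cpus = num_training_workers * resources_per_worker["CPU"]  # 112

    ray.init(num_cpus=num_cpus + 1)       # 113 total: one extra CPU for the head
    print(ray.available_resources())      # e.g. {'CPU': 113.0, ...}
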
pyproject.toml (12 changes: 6 additions & 6 deletions)
@@ -21,7 +21,7 @@ dependencies = [
     "accelerate",
     "datasets>=2.14.6",
     "numpy",
-    "ray @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl",
+    "ray>=2.9",
     "typing>=3.7.4.3",
     "tabulate",
     "ray[tune]",
@@ -49,11 +49,11 @@ cpu = [

 gpu = [
     "transformers>=4.35.0",
-    "torch==2.0.1a0",
-    "torchvision==0.15.2a0",
-    "intel-extension-for-pytorch==2.0.110+xpu",
-    "oneccl_bind_pt",
-    "dpctl"
+    "torch==2.1.0a0",
+    "torchvision==0.16.0a0",
+    "intel-extension-for-pytorch==2.1.10+xpu",
+    "oneccl_bind_pt==2.1.100+xpu",
+    "dpctl==0.15.0"
 ]
 
 deepspeed = [
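
The `gpu` extra now pins a mutually consistent Intel XPU stack (torch 2.1.0a0 with IPEX 2.1.10+xpu and matching oneCCL bindings) instead of leaving `oneccl_bind_pt` and `dpctl` floating. Assuming a machine with Intel GPU drivers installed, a minimal sanity check of that stack might be:

    import torch
    import intel_extension_for_pytorch as ipex  # registers the torch.xpu backend

    print(torch.__version__, ipex.__version__)
    print(torch.xpu.is_available())  # True only if an Intel GPU is usable
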
