Skip to content

Commit

Permalink
remove nightly ray, and use newest release version (#15)
Browse files Browse the repository at this point in the history
* remove nightly ray, and use newest release version

* update

* update

* update
  • Loading branch information
harborn authored Dec 28, 2023
1 parent 91df9e9 commit 7a2b54a
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 5 deletions.
6 changes: 4 additions & 2 deletions common/trainer/default_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,10 @@ def recovery(self, config):
self.starting_epoch = checkpoint_epoch["epoch"] + 1

logger.info(f"recovery to epoch {self.starting_epoch}")
+ except FileNotFoundError as e:
+ logger.info(e)
except Exception as e:
- logger.warning(f"recovery error", exc_info=True)
+ logger.warning("recovery error", exc_info=True)

def _coordinate(self, accelerator):
self.accelerator = accelerator
Expand Down Expand Up @@ -174,7 +176,7 @@ def train(self):
except OverflowError:
eval_loss = float("inf")
perplexity = float("inf")
- logger.info(f"eval epoch:[{idx}/{num_train_epochs}]\tloss:[{eval_loss}]\tppl:[{perplexity}]\ttime:[{time.time()-start}]")
+ logger.info(f"eval epoch:[{idx}/{num_train_epochs}]\tloss:[{eval_loss:.6f}]\tppl:[{perplexity:.6f}]\ttime:[{time.time()-start:.6f}]")

if checkpoint is not None:
self.save(checkpoint, idx)
Expand Down
3 changes: 3 additions & 0 deletions finetune/finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ def train_func(config: Dict[str, Any]):
trainer = common.trainer.Trainer.registory.get("DefaultTrainer")(config = {
"num_train_epochs": config["Training"]["epochs"],
"max_train_step": config["Training"].get("max_train_steps", None),
"log_step": 1,
"output": config["General"]["output_dir"],
"dataprocesser": {
"type": "GeneralProcesser",
Expand Down Expand Up @@ -200,6 +201,8 @@ def main(external_config = None):

ray.init(runtime_env = runtime_env)

common.logger.info(f"ray available resources = {ray.available_resources()}")

scaling_config = ScalingConfig(
num_workers = num_training_workers,
use_gpu = use_gpu,
Expand Down
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ dependencies = [
"accelerate",
"datasets>=2.14.6",
"numpy",
- "ray @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl",
+ "ray>=2.9",
"typing>=3.7.4.3",
"tabulate",
"ray[tune]",
Expand Down Expand Up @@ -52,8 +52,8 @@ gpu = [
"torch==2.0.1a0",
"torchvision==0.15.2a0",
"intel-extension-for-pytorch==2.0.110+xpu",
- "oneccl_bind_pt",
- "dpctl"
+ "oneccl_bind_pt==2.0.100+gpu",
+ "dpctl==0.14.5"
]

deepspeed = [
Expand Down

0 comments on commit 7a2b54a

Please sign in to comment.