Remove nightly Ray and use the newest release version (#15)

* Remove nightly Ray and use the newest release version
* update
* update
* update
intel · Dec 28, 2023 · 7a2b54a · 7a2b54a
1 parent 91df9e9
commit 7a2b54a
Show file tree

Hide file tree

Showing 3 changed files with 10 additions and 5 deletions.
diff --git a/common/trainer/default_trainer.py b/common/trainer/default_trainer.py
@@ -57,8 +57,10 @@ def recovery(self, config):
                 self.starting_epoch = checkpoint_epoch["epoch"] + 1
 
             logger.info(f"recovery to epoch {self.starting_epoch}")
+        except FileNotFoundError as e:
+            logger.info(e)
         except Exception as e:
-            logger.warning(f"recovery error", exc_info=True)
+            logger.warning("recovery error", exc_info=True)
 
     def _coordinate(self, accelerator):
         self.accelerator = accelerator
@@ -174,7 +176,7 @@ def train(self):
                 except OverflowError:
                     eval_loss = float("inf")
                     perplexity = float("inf")
-                logger.info(f"eval epoch:[{idx}/{num_train_epochs}]\tloss:[{eval_loss}]\tppl:[{perplexity}]\ttime:[{time.time()-start}]")
+                logger.info(f"eval epoch:[{idx}/{num_train_epochs}]\tloss:[{eval_loss:.6f}]\tppl:[{perplexity:.6f}]\ttime:[{time.time()-start:.6f}]")
 
             if checkpoint is not None:
                 self.save(checkpoint, idx)

diff --git a/finetune/finetune.py b/finetune/finetune.py
@@ -108,6 +108,7 @@ def train_func(config: Dict[str, Any]):
     trainer = common.trainer.Trainer.registory.get("DefaultTrainer")(config = {
         "num_train_epochs": config["Training"]["epochs"],
         "max_train_step": config["Training"].get("max_train_steps", None),
+        "log_step": 1,
         "output": config["General"]["output_dir"],
         "dataprocesser": {
             "type": "GeneralProcesser",
@@ -200,6 +201,8 @@ def main(external_config = None):
 
         ray.init(runtime_env = runtime_env)
 
+    common.logger.info(f"ray available resources = {ray.available_resources()}")
+
     scaling_config = ScalingConfig(
         num_workers = num_training_workers,
         use_gpu = use_gpu,

diff --git a/pyproject.toml b/pyproject.toml
@@ -21,7 +21,7 @@ dependencies = [
     "accelerate",
     "datasets>=2.14.6",
     "numpy",
-    "ray @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl",
+    "ray>=2.9",
     "typing>=3.7.4.3",
     "tabulate",
     "ray[tune]",
@@ -52,8 +52,8 @@ gpu = [
     "torch==2.0.1a0",
     "torchvision==0.15.2a0",
     "intel-extension-for-pytorch==2.0.110+xpu",
-    "oneccl_bind_pt",
-    "dpctl"
+    "oneccl_bind_pt==2.0.100+gpu",
+    "dpctl==0.14.5"
 ]
 
 deepspeed = [