ERROR prevent resume #54

Open
crapthings opened this issue Oct 25, 2024 · 1 comment

@crapthings
File "/InSPyReNet/run/Train.py", line 175, in <module> [00:00<?, ?it/s]
                                                                         Traceback (most recent call last):
  File "/InSPyReNet/run/Train.py", line 175, in <module>                 
    train(opt, args)
  File "/InSPyReNet/run/Train.py", line 141, in train
    scheduler.step()
  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
    values = self.get_lr()
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
    train(opt, args)
  File "/InSPyReNet/run/Train.py", line 141, in train
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
Epoch 50%|████████████████████                    | 60/120 [00:07<?, ?it/s]
TypeError: '<' not supported between instances of 'complex' and 'float'
    scheduler.step()
  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
    values = self.get_lr()
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
Traceback (most recent call last):
      File "/InSPyReNet/run/Train.py", line 175, in <module>
lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
Traceback (most recent call last):
  File "/InSPyReNet/run/Train.py", line 175, in <module>
    train(opt, args)
  File "/InSPyReNet/run/Train.py", line 141, in train
Traceback (most recent call last):
  File "/InSPyReNet/run/Train.py", line 175, in <module>
    scheduler.step()
      File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
train(opt, args)
  File "/InSPyReNet/run/Train.py", line 141, in train
    values = self.get_lr()
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
    scheduler.step()
  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
    train(opt, args)
      File "/InSPyReNet/run/Train.py", line 141, in train
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr invalues = self.get_lr()

  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
TypeError: '<' not supported between instances of 'complex' and 'float'
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in    
scheduler.step()  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>

  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
Traceback (most recent call last):
  File "/InSPyReNet/run/Train.py", line 175, in <module>
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
    values = self.get_lr()
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
Traceback (most recent call last):
  File "/InSPyReNet/run/Train.py", line 175, in <module>
    train(opt, args)
  File "/InSPyReNet/run/Train.py", line 141, in train
    scheduler.step()
  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
    values = self.get_lr()
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
    train(opt, args)
  File "/InSPyReNet/run/Train.py", line 141, in train
    scheduler.step()
  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
    values = self.get_lr()
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
Traceback (most recent call last):
  File "/InSPyReNet/run/Train.py", line 175, in <module>
    train(opt, args)
  File "/InSPyReNet/run/Train.py", line 141, in train
    scheduler.step()
  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
    values = self.get_lr()
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
Traceback (most recent call last):
  File "/InSPyReNet/run/Train.py", line 175, in <module>
    train(opt, args)
  File "/InSPyReNet/run/Train.py", line 141, in train
    scheduler.step()
  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
    values = self.get_lr()
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
Traceback (most recent call last):
  File "/InSPyReNet/run/Train.py", line 175, in <module>
    train(opt, args)
  File "/InSPyReNet/run/Train.py", line 141, in train
    scheduler.step()
  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
    values = self.get_lr()
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
[2024-10-25 04:18:38,492] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 654567 closing signal SIGTERM
[2024-10-25 04:18:38,492] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 654568 closing signal SIGTERM
[2024-10-25 04:18:38,492] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 654571 closing signal SIGTERM
[2024-10-25 04:18:38,492] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 654573 closing signal SIGTERM
[2024-10-25 04:18:38,492] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 654574 closing signal SIGTERM
[2024-10-25 04:18:40,324] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 654565) of binary: /usr/bin/python
Traceback (most recent call last):
  File "/usr/local/bin/torchrun", line 8, in <module>
    sys.exit(main())
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
    return f(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 812, in main
    run(args)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 803, in run
    elastic_launch(
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 135, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 268, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
============================================================
run/Train.py FAILED
------------------------------------------------------------
Failures:
[1]:
  time      : 2024-10-25_04:18:38
  host      : ca38e8013903
  rank      : 1 (local_rank: 1)
  exitcode  : 1 (pid: 654566)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
  time      : 2024-10-25_04:18:38
  host      : ca38e8013903
  rank      : 4 (local_rank: 4)
  exitcode  : 1 (pid: 654569)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
  time      : 2024-10-25_04:18:38
  host      : ca38e8013903
  rank      : 5 (local_rank: 5)
  exitcode  : 1 (pid: 654570)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[4]:
  time      : 2024-10-25_04:18:38
  host      : ca38e8013903
  rank      : 7 (local_rank: 7)
  exitcode  : 1 (pid: 654572)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2024-10-25_04:18:38
  host      : ca38e8013903
  rank      : 0 (local_rank: 0)
  exitcode  : 1 (pid: 654565)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
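
Looking at the traceback, the failure is in PolyLr.get_lr (lib/optim/scheduler.py, line 25): min() is asked to compare a complex number with a float. My guess (an assumption, not verified against the repo) is that when training resumes, last_epoch is already past max_iteration, so 1 - step / max_iteration goes negative, and a negative float raised to the non-integer gamma produces a complex number in Python 3. A minimal sketch that reproduces the same TypeError under that assumption:

# Hypothetical values, chosen only to trigger the error; the real ones come from the config.
gamma = 0.9            # non-integer decay exponent
max_iteration = 50     # schedule length
step = 60              # resumed step that is already past max_iteration
base_lr = 1e-4
warmup = base_lr       # stand-in for warmup_lr(base_lr, alpha)

# 1 - step / max_iteration is negative, and a negative float raised to a
# non-integer power yields a complex number in Python 3.
poly = base_lr * (1 - step / max_iteration) ** gamma
print(type(poly))      # <class 'complex'>

try:
    min(warmup, poly)  # the comparison done on scheduler.py line 25
except TypeError as e:
    print(e)           # '<' not supported between instances of 'complex' and 'float'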
@crapthings (Author)

This code was suggested by Claude 3.5. With it, training can resume, but I don't know whether it is actually working correctly.

from torch.optim.lr_scheduler import _LRScheduler

class PolyLr(_LRScheduler):
    def __init__(self, optimizer, gamma, max_iteration, minimum_lr=0, warmup_iteration=0, last_epoch=-1):
        self.gamma = gamma
        self.max_iteration = max_iteration
        self.minimum_lr = minimum_lr
        self.warmup_iteration = warmup_iteration
        
        # Remove these lines as they're handled by parent class
        # self.last_epoch = None
        # self.base_lrs = []

        super(PolyLr, self).__init__(optimizer, last_epoch)

    def poly_lr(self, base_lr, step):
        # Ensure step doesn't exceed max_iteration to avoid negative values
        step = min(float(step), self.max_iteration)
        return (base_lr - self.minimum_lr) * (max(0, 1 - (step / self.max_iteration)) ** self.gamma) + self.minimum_lr

    def warmup_lr(self, base_lr, alpha):
        # Ensure alpha is between 0 and 1
        alpha = max(0.0, min(1.0, float(alpha)))
        return base_lr * (1 / 10.0 * (1 - alpha) + alpha)

    def get_lr(self):
        if self.last_epoch < self.warmup_iteration:
            alpha = self.last_epoch / self.warmup_iteration
            lrs = [self.warmup_lr(base_lr, alpha) for base_lr in self.base_lrs]
        else:
            lrs = [self.poly_lr(base_lr, self.last_epoch) for base_lr in self.base_lrs]

        return lrs
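
To check whether the patched schedule behaves, one option is to step a throwaway copy of the scheduler past the point where the old version broke and confirm every learning rate stays a real, non-negative float. A small sketch (the optimizer and hyperparameter values below are placeholders, not the ones used in Train.py):

import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
scheduler = PolyLr(optimizer, gamma=0.9, max_iteration=120,
                   minimum_lr=1e-7, warmup_iteration=12)

for step in range(200):          # deliberately runs past max_iteration
    lr = scheduler.get_last_lr()[0]
    assert isinstance(lr, float) and lr >= 0, (step, lr)
    optimizer.step()             # avoids the "optimizer.step before lr_scheduler.step" warning
    scheduler.step()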




# from torch.optim.lr_scheduler import _LRScheduler


# class PolyLr(_LRScheduler):
#     def __init__(self, optimizer, gamma, max_iteration, minimum_lr=0, warmup_iteration=0, last_epoch=-1):
#         self.gamma = gamma
#         self.max_iteration = max_iteration
#         self.minimum_lr = minimum_lr
#         self.warmup_iteration = warmup_iteration
        
#         self.last_epoch = None
#         self.base_lrs = []

#         super(PolyLr, self).__init__(optimizer, last_epoch)

#     def poly_lr(self, base_lr, step):
#         return (base_lr - self.minimum_lr) * ((1 - (step / self.max_iteration)) ** self.gamma) + self.minimum_lr

#     def warmup_lr(self, base_lr, alpha):
#         return base_lr * (1 / 10.0 * (1 - alpha) + alpha)

#     def get_lr(self):
#         if self.last_epoch < self.warmup_iteration:
#             alpha = self.last_epoch / self.warmup_iteration
#             lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
#                     self.base_lrs]
#         else:
#             lrs = [self.poly_lr(base_lr, self.last_epoch) for base_lr in self.base_lrs]

#         return lrs
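
Since the problem only shows up when resuming, it may also help to save and restore the scheduler together with the model and optimizer, so that last_epoch continues from where training stopped instead of being re-derived. A sketch using the standard state_dict() / load_state_dict() that _LRScheduler already provides, continuing from the objects in the sketch above (the checkpoint keys are just example names, not what Train.py actually uses):

# saving
torch.save({
    'model': model.state_dict(),
    'optimizer': optimizer.state_dict(),
    'scheduler': scheduler.state_dict(),
}, 'checkpoint.pth')

# resuming
ckpt = torch.load('checkpoint.pth', map_location='cpu')
model.load_state_dict(ckpt['model'])
optimizer.load_state_dict(ckpt['optimizer'])
scheduler.load_state_dict(ckpt['scheduler'])   # restores last_epoch and base_lrs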
