ERROR prevent resume #54

Open
crapthings opened this issue Oct 25, 2024 · 1 comment

@crapthings
File "/InSPyReNet/run/Train.py", line 175, in <module> [00:00<?, ?it/s]
                                                                         Traceback (most recent call last):
  File "/InSPyReNet/run/Train.py", line 175, in <module>                 
    train(opt, args)
  File "/InSPyReNet/run/Train.py", line 141, in train
    scheduler.step()
  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
    values = self.get_lr()
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
    train(opt, args)
  File "/InSPyReNet/run/Train.py", line 141, in train
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
Epoch 50%|████████████████████                    | 60/120 [00:07<?, ?it/s]
TypeError: '<' not supported between instances of 'complex' and 'float'
    scheduler.step()
  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
    values = self.get_lr()
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
Traceback (most recent call last):
      File "/InSPyReNet/run/Train.py", line 175, in <module>
lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
Traceback (most recent call last):
  File "/InSPyReNet/run/Train.py", line 175, in <module>
    train(opt, args)
  File "/InSPyReNet/run/Train.py", line 141, in train
Traceback (most recent call last):
  File "/InSPyReNet/run/Train.py", line 175, in <module>
    scheduler.step()
      File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
train(opt, args)
  File "/InSPyReNet/run/Train.py", line 141, in train
    values = self.get_lr()
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
    scheduler.step()
  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
    train(opt, args)
      File "/InSPyReNet/run/Train.py", line 141, in train
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr invalues = self.get_lr()

  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
TypeError: '<' not supported between instances of 'complex' and 'float'
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in    
scheduler.step()  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>

  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
Traceback (most recent call last):
  File "/InSPyReNet/run/Train.py", line 175, in <module>
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
    values = self.get_lr()
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
Traceback (most recent call last):
  File "/InSPyReNet/run/Train.py", line 175, in <module>
    train(opt, args)
  File "/InSPyReNet/run/Train.py", line 141, in train
    scheduler.step()
  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
    values = self.get_lr()
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
    train(opt, args)
  File "/InSPyReNet/run/Train.py", line 141, in train
    scheduler.step()
  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
    values = self.get_lr()
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
Traceback (most recent call last):
  File "/InSPyReNet/run/Train.py", line 175, in <module>
    train(opt, args)
  File "/InSPyReNet/run/Train.py", line 141, in train
    scheduler.step()
  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
    values = self.get_lr()
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
Traceback (most recent call last):
  File "/InSPyReNet/run/Train.py", line 175, in <module>
    train(opt, args)
  File "/InSPyReNet/run/Train.py", line 141, in train
    scheduler.step()
  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
    values = self.get_lr()
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
Traceback (most recent call last):
  File "/InSPyReNet/run/Train.py", line 175, in <module>
    train(opt, args)
  File "/InSPyReNet/run/Train.py", line 141, in train
    scheduler.step()
  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
    values = self.get_lr()
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
[2024-10-25 04:18:38,492] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 654567 closing signal SIGTERM
[2024-10-25 04:18:38,492] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 654568 closing signal SIGTERM
[2024-10-25 04:18:38,492] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 654571 closing signal SIGTERM
[2024-10-25 04:18:38,492] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 654573 closing signal SIGTERM
[2024-10-25 04:18:38,492] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 654574 closing signal SIGTERM
[2024-10-25 04:18:40,324] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 654565) of binary: /usr/bin/python
Traceback (most recent call last):
  File "/usr/local/bin/torchrun", line 8, in <module>
    sys.exit(main())
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
    return f(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 812, in main
    run(args)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 803, in run
    elastic_launch(
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 135, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 268, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
============================================================
run/Train.py FAILED
------------------------------------------------------------
Failures:
[1]:
  time      : 2024-10-25_04:18:38
  host      : ca38e8013903
  rank      : 1 (local_rank: 1)
  exitcode  : 1 (pid: 654566)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
  time      : 2024-10-25_04:18:38
  host      : ca38e8013903
  rank      : 4 (local_rank: 4)
  exitcode  : 1 (pid: 654569)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
  time      : 2024-10-25_04:18:38
  host      : ca38e8013903
  rank      : 5 (local_rank: 5)
  exitcode  : 1 (pid: 654570)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[4]:
  time      : 2024-10-25_04:18:38
  host      : ca38e8013903
  rank      : 7 (local_rank: 7)
  exitcode  : 1 (pid: 654572)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2024-10-25_04:18:38
  host      : ca38e8013903
  rank      : 0 (local_rank: 0)
  exitcode  : 1 (pid: 654565)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
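
Looking at the traceback, the failure is in PolyLr.get_lr (lib/optim/scheduler.py, line 25): min() is asked to compare a complex number with a float. My guess (an assumption, not verified against the repo) is that when training resumes, last_epoch is already past max_iteration, so 1 - step / max_iteration goes negative, and a negative float raised to the non-integer gamma produces a complex number in Python 3. A minimal sketch that reproduces the same TypeError under that assumption:

# Hypothetical values, chosen only to trigger the error; the real ones come from the config.
gamma = 0.9            # non-integer decay exponent
max_iteration = 50     # schedule length
step = 60              # resumed step that is already past max_iteration
base_lr = 1e-4
warmup = base_lr       # stand-in for warmup_lr(base_lr, alpha)

# 1 - step / max_iteration is negative, and a negative float raised to a
# non-integer power yields a complex number in Python 3.
poly = base_lr * (1 - step / max_iteration) ** gamma
print(type(poly))      # <class 'complex'>

try:
    min(warmup, poly)  # the comparison done on scheduler.py line 25
except TypeError as e:
    print(e)           # '<' not supported between instances of 'complex' and 'float'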
@crapthings (Author)

This code was suggested by Claude 3.5. With it, training can resume, but I don't know whether it is actually working correctly.

from torch.optim.lr_scheduler import _LRScheduler

class PolyLr(_LRScheduler):
    def __init__(self, optimizer, gamma, max_iteration, minimum_lr=0, warmup_iteration=0, last_epoch=-1):
        self.gamma = gamma
        self.max_iteration = max_iteration
        self.minimum_lr = minimum_lr
        self.warmup_iteration = warmup_iteration
        
        # Remove these lines as they're handled by parent class
        # self.last_epoch = None
        # self.base_lrs = []

        super(PolyLr, self).__init__(optimizer, last_epoch)

    def poly_lr(self, base_lr, step):
        # Ensure step doesn't exceed max_iteration to avoid negative values
        step = min(float(step), self.max_iteration)
        return (base_lr - self.minimum_lr) * (max(0, 1 - (step / self.max_iteration)) ** self.gamma) + self.minimum_lr

    def warmup_lr(self, base_lr, alpha):
        # Ensure alpha is between 0 and 1
        alpha = max(0.0, min(1.0, float(alpha)))
        return base_lr * (1 / 10.0 * (1 - alpha) + alpha)

    def get_lr(self):
        if self.last_epoch < self.warmup_iteration:
            alpha = self.last_epoch / self.warmup_iteration
            lrs = [self.warmup_lr(base_lr, alpha) for base_lr in self.base_lrs]
        else:
            lrs = [self.poly_lr(base_lr, self.last_epoch) for base_lr in self.base_lrs]

        return lrs
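
To check whether the patched schedule behaves, one option is to step a throwaway copy of the scheduler past the point where the old version broke and confirm every learning rate stays a real, non-negative float. A small sketch (the optimizer and hyperparameter values below are placeholders, not the ones used in Train.py):

import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
scheduler = PolyLr(optimizer, gamma=0.9, max_iteration=120,
                   minimum_lr=1e-7, warmup_iteration=12)

for step in range(200):          # deliberately runs past max_iteration
    lr = scheduler.get_last_lr()[0]
    assert isinstance(lr, float) and lr >= 0, (step, lr)
    optimizer.step()             # avoids the "optimizer.step before lr_scheduler.step" warning
    scheduler.step()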




# from torch.optim.lr_scheduler import _LRScheduler


# class PolyLr(_LRScheduler):
#     def __init__(self, optimizer, gamma, max_iteration, minimum_lr=0, warmup_iteration=0, last_epoch=-1):
#         self.gamma = gamma
#         self.max_iteration = max_iteration
#         self.minimum_lr = minimum_lr
#         self.warmup_iteration = warmup_iteration
        
#         self.last_epoch = None
#         self.base_lrs = []

#         super(PolyLr, self).__init__(optimizer, last_epoch)

#     def poly_lr(self, base_lr, step):
#         return (base_lr - self.minimum_lr) * ((1 - (step / self.max_iteration)) ** self.gamma) + self.minimum_lr

#     def warmup_lr(self, base_lr, alpha):
#         return base_lr * (1 / 10.0 * (1 - alpha) + alpha)

#     def get_lr(self):
#         if self.last_epoch < self.warmup_iteration:
#             alpha = self.last_epoch / self.warmup_iteration
#             lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
#                     self.base_lrs]
#         else:
#             lrs = [self.poly_lr(base_lr, self.last_epoch) for base_lr in self.base_lrs]

#         return lrs
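
Since the problem only shows up when resuming, it may also help to save and restore the scheduler together with the model and optimizer, so that last_epoch continues from where training stopped instead of being re-derived. A sketch using the standard state_dict() / load_state_dict() that _LRScheduler already provides, continuing from the objects in the sketch above (the checkpoint keys are just example names, not what Train.py actually uses):

# saving
torch.save({
    'model': model.state_dict(),
    'optimizer': optimizer.state_dict(),
    'scheduler': scheduler.state_dict(),
}, 'checkpoint.pth')

# resuming
ckpt = torch.load('checkpoint.pth', map_location='cpu')
model.load_state_dict(ckpt['model'])
optimizer.load_state_dict(ckpt['optimizer'])
scheduler.load_state_dict(ckpt['scheduler'])   # restores last_epoch and base_lrs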
