Stuck forever in accelerator.backward without any logs 2 #3347

Open
XueruiSu opened this issue Jan 16, 2025 · 2 comments

@XueruiSu

I am hitting a similar problem: training gets stuck forever in `accelerator.backward` without any logs. Here is my backward code:

def tsrl_step(
        self, 
        prompts_list: list[torch.Tensor], 
        input_ids_list: list[torch.Tensor],
        attention_mask_list: list[torch.Tensor],
        prediction: tuple = None,
        init_value_list: list[float] = None,
        max_n_sample: int = 8,
        cur_max_new_tokens: int = 32,
    ) -> dict[str, Any]:
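        """One preference-optimization step: pair the best completion with a
        worse one for each prompt, compute a DPO-style log-ratio loss against
        the frozen reference model, and step the optimizer via Accelerate."""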
        print([p.shape for p in prompts_list],
              [ids.shape for ids in input_ids_list],
              [mask.shape for mask in attention_mask_list])
        print(prediction, init_value_list, max_n_sample, cur_max_new_tokens)
        losses, better_sample_rewards, worse_sample_rewards, max_lengths = [], [], [], []
        n_sample = len(input_ids_list)
        start = prompts_list[0].size(-1) - 1
        better_idx = -1
        worse_idx = 0 if self.args.choose_worst else -2
        
        all_better_input_ids, all_worse_input_ids = [], []
        all_better_attention_mask, all_worse_attention_mask = [], []
        all_init_value_list = []
        for sample_id in range(n_sample):
            if len(all_better_input_ids) >= max_n_sample: break
            
            input_ids = input_ids_list[sample_id]
            attention_mask = attention_mask_list[sample_id]
            
            n_output = input_ids.size(0)
            if n_output < 2: continue
            
            if self.args.choose_random:
                worse_idx = random.choice(range(n_output - 1))
                
            all_better_input_ids.append(input_ids[better_idx])
            all_worse_input_ids.append(input_ids[worse_idx])
            all_better_attention_mask.append(attention_mask[better_idx])
            all_worse_attention_mask.append(attention_mask[worse_idx])
            all_init_value_list.extend([init_value_list[sample_id][better_idx], init_value_list[sample_id][worse_idx]])
        all_input_ids = pad_tensors(all_better_input_ids + all_worse_input_ids, pad_value=self.tokenizer.pad_token_id) # torch.Size([2, 209])
        all_attention_mask = pad_tensors(all_better_attention_mask + all_worse_attention_mask, pad_value=False) # torch.Size([2, 209])
        print(all_input_ids.shape, all_attention_mask.shape)
        print(self.actor_model, self.actor_reference_model, self.actor_model.device, self.actor_reference_model.device)
        torch.cuda.empty_cache()
        all_sequence_log_probs = self.compute_log_probs(
            self.actor_model,
            input_ids=all_input_ids,
            attention_mask=all_attention_mask,
        ) #  torch.Size([2, 208])
        all_better_input_ids, all_worse_input_ids = all_input_ids.chunk(chunks=2, dim=0)
        all_better_attention_mask, all_worse_attention_mask = all_attention_mask.chunk(chunks=2, dim=0)
        all_better_sequence_log_probs, all_worse_sequence_log_probs = all_sequence_log_probs.chunk(chunks=2, dim=0) # torch.Size([1, 208]) torch.Size([1, 208])
        print(all_better_input_ids.shape, all_worse_input_ids.shape)
        print(all_better_attention_mask.shape, all_worse_attention_mask.shape)
        print(all_better_sequence_log_probs.shape, all_worse_sequence_log_probs.shape)
        
        label_smoothing_values = []
        for sample_id in range(len(all_better_input_ids)):
            better_input_ids = all_better_input_ids[sample_id] # torch.Size([209])
            better_attention_mask = all_better_attention_mask[sample_id] # torch.Size([209])
            
            worse_input_ids = all_worse_input_ids[sample_id] # torch.Size([209])
            worse_attention_mask = all_worse_attention_mask[sample_id] # torch.Size([209])
            
            init_values = [all_init_value_list[sample_id * 2], all_init_value_list[sample_id * 2 + 1]]
            better_sequence_log_probs, worse_sequence_log_probs = all_better_sequence_log_probs[sample_id], all_worse_sequence_log_probs[sample_id]
            
            print(better_sequence_log_probs.shape, worse_sequence_log_probs.shape)
            with torch.no_grad():
                torch.cuda.empty_cache()
                ref_better_sequence_log_probs = self.compute_log_probs(
                    self.actor_reference_model,
                    input_ids=better_input_ids.unsqueeze(0),
                    attention_mask=better_attention_mask.unsqueeze(0),
                )[0] # torch.Size([208])
                torch.cuda.empty_cache()
                ref_worse_sequence_log_probs = self.compute_log_probs(
                    self.actor_reference_model,
                    input_ids=worse_input_ids.unsqueeze(0),
                    attention_mask=worse_attention_mask.unsqueeze(0),
                )[0] # torch.Size([208])
            print(ref_better_sequence_log_probs.shape, ref_worse_sequence_log_probs.shape)
            
            better_end_index = better_attention_mask.nonzero()[-1]
            worse_end_index = worse_attention_mask.nonzero()[-1]
            try:
                diverge_index = (better_input_ids != worse_input_ids).nonzero()[0]
                assert 0 <= diverge_index <= better_end_index, 'diverge index is out of range!'
                assert 0 <= diverge_index <= worse_end_index, 'diverge index is out of range!'
            except (IndexError, AssertionError):
                # identical sequences or an out-of-range divergence point: skip this pair
                continue
            
            better_seq_slice = slice(diverge_index - 1, better_end_index)
            worse_seq_slice = slice(diverge_index - 1, worse_end_index)
            
            better_log_probs = better_sequence_log_probs[better_seq_slice].sum(dim=-1)
            worse_log_probs = worse_sequence_log_probs[worse_seq_slice].sum(dim=-1)
            ref_better_log_probs = ref_better_sequence_log_probs[better_seq_slice].sum(dim=-1)
            ref_worse_log_probs = ref_worse_sequence_log_probs[worse_seq_slice].sum(dim=-1)
            better_log_ratio = better_log_probs - ref_better_log_probs
            worse_log_ratio = worse_log_probs - ref_worse_log_probs
            if self.args.norm_prob or self.args.ipo:
                better_log_ratio /= better_attention_mask[better_seq_slice].sum(dim=-1) ** self.args.length_penalty
                worse_log_ratio /= worse_attention_mask[worse_seq_slice].sum(dim=-1) ** self.args.length_penalty
            logits = better_log_ratio - worse_log_ratio
            
            if self.args.ipo:
                losses.append((logits - 1 / (2 * self.scale_coeff)) ** 2)
            elif self.args.conservative:
                qb, qw = init_values
                confidence = calculate_preference_confidence(qb, qw)
                label_smoothing = min(1 - confidence, 0.5)
                losses.append(
                    - F.logsigmoid(self.scale_coeff * logits) * (1 - label_smoothing)
                    - F.logsigmoid(-self.scale_coeff * logits) * label_smoothing
                )
                label_smoothing_values.append(label_smoothing)
            else:
                losses.append(-F.logsigmoid(self.scale_coeff * logits))
            better_sample_rewards.append(self.scale_coeff * better_log_ratio.detach())
            worse_sample_rewards.append(self.scale_coeff * worse_log_ratio.detach())
            
            max_lengths.append(better_attention_mask[start:].float().sum())
            max_lengths.append(worse_attention_mask[start:].float().sum())
        
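        # NOTE: if this early return happens on some ranks but not all of them,
        # the remaining ranks can block forever in the gradient all-reduce
        # triggered by accelerator.backward below.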
        if not len(losses): return {}
        
        loss = torch.stack(losses).mean()
        max_generated_length = torch.stack(max_lengths).max()
        total_max_generated_length = max_generated_length + start
        better_sample_rewards = torch.stack(better_sample_rewards)  # size = (B,)
        worse_sample_rewards = torch.stack(worse_sample_rewards)  # size = (B,)
        rewards_accuracy = (
            (better_sample_rewards > worse_sample_rewards).float().mean()
        )  # size = ()
        better_sample_rewards = better_sample_rewards.mean()  # size = ()
        worse_sample_rewards = worse_sample_rewards.mean()  # size = ()
        rewards = better_sample_rewards + worse_sample_rewards  # size = ()
        rewards_margin = better_sample_rewards - worse_sample_rewards  # size = ()
        
        torch.cuda.empty_cache()
        # loss = all_sequence_log_probs.mean()
        self.optimizer.zero_grad()
        print("losslosslosslosslosslosslosslosslosslosslossloss-------", loss)
        # self.accelerator.wait_for_everyone()
        print(self.accelerator)
        self.accelerator.backward(loss)
        print("self.accelerator.backward(loss) done")
        print("loss backward-------")
        self.optimizer.step()
        self.lr_scheduler.step()
        print("loss step-------")
        
        loss = get_all_reduce_mean(loss)
        rewards = get_all_reduce_mean(rewards)
        better_sample_rewards = get_all_reduce_mean(better_sample_rewards)
        worse_sample_rewards = get_all_reduce_mean(worse_sample_rewards)
        rewards_accuracy = get_all_reduce_mean(rewards_accuracy)
        rewards_margin = get_all_reduce_mean(rewards_margin)
        max_generated_length = get_all_reduce_max(max_generated_length)
        total_max_generated_length = get_all_reduce_max(total_max_generated_length)
        
        return {
            'train/loss': loss.item(),
            'train/rewards': rewards.item(),
            'train/better_sample_rewards': better_sample_rewards.item(),
            'train/worse_sample_rewards': worse_sample_rewards.item(),
            'train/rewards_accuracy': rewards_accuracy.item(),
            'train/rewards_margin': rewards_margin.item(),
            'train/lr': self.actor_model.optimizer.param_groups[0]['lr'],
            'train/r_scores': float(prediction[0]),
            'train/correct': float(prediction[1]),
            'train/n_sample': n_sample,
            'train/max_generated_length': max_generated_length.item(),
            'train/total_max_generated_length': total_max_generated_length.item(),
            'train/label_smoothing': sum(label_smoothing_values) / len(label_smoothing_values) if len(label_smoothing_values) else 0,
            'train/cur_max_new_tokens': cur_max_new_tokens,
        }

I tried the same code on 1 GPU and it worked, but it gets stuck when using 4 GPUs. Below is the output when using 1 GPU:

losslosslosslosslosslosslosslosslosslosslossloss------- tensor(0.6932, device='cuda:0', grad_fn=<MeanBackward0>)
  Invalidate trace cache @ step 422 and module 0: cache has only 422 modules
  GCRAZGDL1146:4143282:4162655 [0] NCCL INFO Using non-device net plugin version 0
  GCRAZGDL1146:4143282:4162655 [0] NCCL INFO Using network Socket
  GCRAZGDL1146:4143282:4162655 [0] NCCL INFO bootstrapSplit: comm 0x76bf9803f340 parent 0xfe0a210 rank 0 nranks 1 color -1091263299 key 0 prev 0 next 0 - DONE
  GCRAZGDL1146:4143282:4162655 [0] NCCL INFO ncclCommSplit comm 0x76bf9803f340 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 100000 parent 0xfe0a210 color -1091263299 key 0 commId 0x55efba2da9b35614 - Init START
  GCRAZGDL1146:4143282:4162655 [0] NCCL INFO Setting affinity for GPU 0 to ffffff
  GCRAZGDL1146:4143282:4162655 [0] NCCL INFO comm 0x76bf9803f340 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
  GCRAZGDL1146:4143282:4162655 [0] NCCL INFO Channel 00/32 :    0
  ... (Channel 01/32 through 31/32: identical single-rank channel lines) ...
  GCRAZGDL1146:4143282:4162655 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1
  GCRAZGDL1146:4143282:4162655 [0] NCCL INFO P2P Chunksize set to 131072
  GCRAZGDL1146:4143282:4162655 [0] NCCL INFO Connected all rings
  GCRAZGDL1146:4143282:4162655 [0] NCCL INFO Connected all trees
  GCRAZGDL1146:4143282:4162655 [0] NCCL INFO 32 coll channels, 32 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
  GCRAZGDL1146:4143282:4162655 [0] NCCL INFO ncclCommSplit comm 0x76bf9803f340 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 100000 parent 0xfe0a210 color -1091263299 key 0 commId 0x55efba2da9b35614 - Init COMPLETE
  loss backward------- tensor(0.6932, device='cuda:0', grad_fn=<MeanBackward0>)
  loss step------- tensor(0.6932, device='cuda:0', grad_fn=<MeanBackward0>)

And below is the output when using 4 GPUs:

losslosslosslosslosslosslosslosslosslosslossloss------- tensor(0.6935, device='cuda:0', grad_fn=<MeanBackward0>)
<accelerate.accelerator.Accelerator object at 0x7f2b085b42e0>
rl4svm2:96486:98083 [0] NCCL INFO AllReduce: opCount 246 sendbuff 0x7f1f84000000 recvbuff 0x7f1f84000000 count 525336576 datatype 9 op 0 root 0 comm 0x11753a70 [nranks=4] stream 0xc8fe8a0
rl4svm2:96486:98083 [0] NCCL INFO 1050673152 Bytes -> Algo 1 proto 2 time 105109.914062

Up to the `print("losslosslosslosslosslosslosslosslosslosslossloss-------", loss)` line, the code prints everything it should, whether I use 1 GPU or 4 GPUs; with 4 GPUs the output then stops right after the NCCL AllReduce lines above, i.e. inside `accelerator.backward`.

Where should I start debugging? I am currently using this DeepSpeed config through Accelerate (a hypothesis and a guard sketch follow the config):

compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  gradient_accumulation_steps: 4
  zero_stage: 2
distributed_type: DEEPSPEED
enable_cpu_affinity: true
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 4
rdzv_backend: static
same_network: true
use_cpu: false
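
One pattern I want to rule out (my own guess, not confirmed): `tsrl_step` returns `{}` early when `losses` is empty, so a rank that skips the step would never issue the gradient all-reduce that the other three ranks are waiting on under ZeRO-2, which would match the 4-GPU log ending on a pending AllReduce. A minimal sketch of a guard, assuming Accelerate has initialized `torch.distributed` (the helper name `all_ranks_have_loss` is mine, not from the codebase):

import torch
import torch.distributed as dist

def all_ranks_have_loss(losses: list, device: torch.device) -> bool:
    """Return True only if every rank produced at least one loss term."""
    flag = torch.tensor(float(len(losses) > 0), device=device)
    if dist.is_available() and dist.is_initialized():
        # MIN across ranks: drops to 0.0 if any single rank has nothing to train on
        dist.all_reduce(flag, op=dist.ReduceOp.MIN)
    return bool(flag.item())

Replacing `if not len(losses): return {}` with `if not all_ranks_have_loss(losses, all_input_ids.device): return {}` would force all four ranks to take the same branch, so the collectives inside `accelerator.backward` stay matched.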

Originally posted by @XueruiSu in #287

@muellerzr (Collaborator)

Can you please give us the output of `accelerate env`?

@XueruiSu (Author)

My `accelerate env` output is below:

- `Accelerate` version: 1.2.0
- Platform: Linux-5.15.0-1074-azure-x86_64-with-glibc2.31
- `accelerate` bash location: /home/msrai4srl4s/miniconda3/envs/mcts-dpo/bin/accelerate
- Python version: 3.10.16
- Numpy version: 1.26.4
- PyTorch version (GPU?): 2.5.1+cu118 (True)
- PyTorch XPU available: False
- PyTorch NPU available: False
- PyTorch MLU available: False
- PyTorch MUSA available: False
- System RAM: 866.06 GB
- GPU type: NVIDIA A100 80GB PCIe
- `Accelerate` default config:
        - compute_environment: LOCAL_MACHINE
        - distributed_type: MULTI_GPU
        - mixed_precision: fp16
        - use_cpu: False
        - debug: False
        - num_processes: 2
        - machine_rank: 0
        - num_machines: 1
        - gpu_ids: 1,3
        - rdzv_backend: static
        - same_network: True
        - main_training_function: main
        - enable_cpu_affinity: True
        - downcast_bf16: no
        - tpu_use_cluster: False
        - tpu_use_sudo: False
        - tpu_env: []

Here is the config I used for this experiment:

# zero stage 2:
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  gradient_accumulation_steps: 4
  zero_stage: 2
distributed_type: DEEPSPEED
enable_cpu_affinity: true
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 4
rdzv_backend: static
same_network: true
use_cpu: false

And the launch code:

export CUDA_VISIBLE_DEVICES=0,1,2,3
export NCCL_DEBUG=INFO
export NCCL_IB_DISABLE=0
export NCCL_P2P_DISABLE=0
export NCCL_DEBUG_SUBSYS=ALL

accelerate launch --config_file "$config_file" \
    --module reasoning.mcts \
    --train_datasets MathQA/train \
    --model_type llama3 \
    --choose_worst \
    --save_mcts_data \
    --filter \
    --iteration_interval 64 \
    --actor_model_name_or_path "${ACTOR_MODEL_NAME_OR_PATH}" \
    --actor_ref_model_name_or_path "${ACTOR_REF_MODEL_NAME_OR_PATH}" \
    --scale_coeff 0.1 \
    --max_length 512 \
    --temperature 1.0 \
    --init_temperature 1.0 \
    --mcts_length_penalty 1.25 \
    --num_return_sequences 1 \
    --repetition_penalty 1.0 \
    --trust_remote_code True \
    --epochs 1 \
    ...
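
If the hang persists, I can at least find the exact line each rank is blocked on with the standard-library `faulthandler` (a diagnostic I would add near the top of `main`, not part of the original script):

import faulthandler

# Dump every thread's Python stack to stderr after 5 minutes, and keep
# repeating, so whichever rank is stuck shows the blocking call.
faulthandler.dump_traceback_later(timeout=300, repeat=True)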
