add ex1 #4

Merged · 1 commit · Feb 15, 2024
@@ -0,0 +1 @@
python ../src/model/scripts/spm_training/train.py
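
Note: train.py is not included in this diff; going by the spm_training path and the SentencePiece merge in the next step, it presumably trains a SentencePiece model on the Thai corpus. A minimal sketch of that step, where the corpus file, output prefix, vocab size, and model type are illustrative assumptions:

import sentencepiece as spm

# Hypothetical arguments: corpus file, output prefix, and vocab size are placeholders.
spm.SentencePieceTrainer.train(
    input="thai_corpus.txt",      # raw-text training corpus, one sentence per line
    model_prefix="thai_sp",       # writes thai_sp.model and thai_sp.vocab
    vocab_size=32000,
    model_type="bpe",
    character_coverage=0.9995,    # keep rare Thai characters in the vocab
)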
8 changes: 8 additions & 0 deletions experiment_effect_of_pre-train/step1_2_submit_mergeSPM.sh
@@ -0,0 +1,8 @@
MODEL_DIR=/workspace/model
SP_DIR=/workspace/data
OUTPUT_DIR=/workspace/output

python ../src/model/scripts/llama_thai_tokenizer/merge_tokenizer.py \
    --llama_path $MODEL_DIR \
    --sp_path $SP_DIR \
    --output_path $OUTPUT_DIR
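
Note: merge_tokenizer.py itself is not part of this diff. Assuming it follows the usual recipe for extending a LLaMA tokenizer with pieces from a new SentencePiece model (file names below are placeholders), the core of the merge looks roughly like this:

from transformers import LlamaTokenizer
from sentencepiece import sentencepiece_model_pb2 as sp_pb2

llama_tokenizer = LlamaTokenizer.from_pretrained("/workspace/model")

# Parse the LLaMA tokenizer's underlying SentencePiece model.
llama_proto = sp_pb2.ModelProto()
llama_proto.ParseFromString(llama_tokenizer.sp_model.serialized_model_proto())

# Parse the newly trained Thai SentencePiece model (file name is hypothetical).
thai_proto = sp_pb2.ModelProto()
with open("/workspace/data/thai_sp.model", "rb") as f:
    thai_proto.ParseFromString(f.read())

# Append every Thai piece that LLaMA does not already have.
existing = {p.piece for p in llama_proto.pieces}
for p in thai_proto.pieces:
    if p.piece not in existing:
        new_piece = sp_pb2.ModelProto().SentencePiece()
        new_piece.piece = p.piece
        new_piece.score = 0
        llama_proto.pieces.append(new_piece)

with open("/workspace/output/merged_tokenizer.model", "wb") as f:
    f.write(llama_proto.SerializeToString())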
10 changes: 10 additions & 0 deletions experiment_effect_of_pre-train/step1_3_submit_updateModelVocab.sh
@@ -0,0 +1,10 @@
MODEL_DIR=/workspace/model
TOKENIZER_DIR=/workspace/data
OUTPUT_DIR=/workspace/output



python update_ModelVocab.py \
    --model_name_or_path $MODEL_DIR \
    --tokenizer_name_or_path $TOKENIZER_DIR \
    --output_dir $OUTPUT_DIR
@@ -0,0 +1 @@
python ../src/model/scripts/lighting_training/data_preprocessing.py
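
Note: data_preprocessing.py is also not shown in this diff. For reference, train_v2.py loads each data directory with load_from_disk(os.path.join(path, split)) and batches raw "input_ids", so preprocessing has to leave a train and an eval dataset of fixed-length token ids under each data directory. A minimal sketch of producing that layout, where the tokenizer path, placeholder corpus, and block size are assumptions:

from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("/workspace/output", use_fast=False)
block_size = 512  # matches the default model_max_length in train_v2.py

texts = ["...", "..."]  # placeholder corpus; one string per document
input_ids = tokenizer(
    texts, truncation=True, max_length=block_size, padding="max_length"
)["input_ids"]

dataset = Dataset.from_dict({"input_ids": input_ids})
split = dataset.train_test_split(test_size=0.05, seed=42)
split["train"].save_to_disk("/workspace/tha/train")  # consumed via --data_path
split["test"].save_to_disk("/workspace/tha/eval")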
103 changes: 103 additions & 0 deletions experiment_effect_of_pre-train/step3_smultinode_deepspeed.sh
@@ -0,0 +1,103 @@
#!/usr/bin/env bash
#sleep 30
#fi_info -p efa -t FI_EP_RDM

# HOSTNAMES MASTER_ADDR MASTER_PORT COUNT_NODE are coming from the main script



module restore
module load Miniconda3
module load PrgEnv-gnu
module load cpe-cuda
module load cudatoolkit/22.7_11.7
module load craype-accel-nvidia80
# module load aws-ofi-nccl
module load gcc/10.3.0


conda deactivate
conda activate /project/lt200056-opgpth/boss/stanford_alpaca/env

# conda deactivate
# conda activate /project/lt200056-opgpth/multinode-fix/stanford_alpaca_init/conda

echo myuser=`whoami`
echo COUNT_NODE=$COUNT_NODE
echo LD_LIBRARY_PATH = $LD_LIBRARY_PATH
echo PATH = $PATH
echo which mpicc `which mpicc`
echo HOSTNAMES = $HOSTNAMES
echo hostname = `hostname`
echo MASTER_ADDR= $MASTER_ADDR
echo MASTER_PORT= $MASTER_PORT

H=`hostname`
# THEID: this node's index in the space-separated $HOSTNAMES list, i.e. its node rank.
THEID=`echo -e $HOSTNAMES | python -c "import sys;[sys.stdout.write(str(i)) for i,line in enumerate(next(sys.stdin).split(' ')) if line.strip() == '$H'.strip()]"`
echo THEID=$THEID
echo SLURM_PROCID=$SLURM_PROCID

export NCCL_TIMEOUT=3600000
export NCCL_BLOCKING_WAIT=0


# source /fsx/dalle2/.dalle_env_38/bin/activate
# echo python3 version = `python3 --version`
# python -c "import torch"

MODEL_DIR=/workspace/model
TOKENIZER_DIR=/workspace/data
OUTPUT_DIR=/workspace/output
DATA_THA_DIR=/workspace/tha
DATA_ENG_DIR=/workspace/en


accelerate launch \
    --num_processes $(( 4 * $COUNT_NODE )) \
    --num_machines $COUNT_NODE \
    --multi_gpu \
    --mixed_precision fp16 \
    --machine_rank $SLURM_PROCID \
    --main_process_ip $MASTER_ADDR \
    --main_process_port $MASTER_PORT \
    ./train_v2.py \
    --model_name_or_path $MODEL_DIR \
    --tokenizer_name_or_path $TOKENIZER_DIR \
    --use_flash_attention_2 False \
    --data_path $DATA_THA_DIR \
        $DATA_ENG_DIR \
    --data_weights 0.9 0.1 \
    --data_seed 42 \
    --train_split train \
    --eval_split eval \
    --bf16 True \
    --output_dir $OUTPUT_DIR \
    --num_train_epochs 1 \
    --per_device_train_batch_size 4 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 2 \
    --evaluation_strategy "steps" \
    --eval_steps 700 \
    --save_strategy "steps" \
    --save_steps 700 \
    --save_total_limit 5 \
    --logging_strategy 'steps' \
    --logging_steps 1 \
    --logging_first_step True \
    --learning_rate 5e-5 \
    --weight_decay 0.001 \
    --warmup_ratio 0.03 \
    --deepspeed ../src/model/scripts/hf_trainer/config/llama_deepspeed.json \
    --tf32 True \
    --gradient_checkpointing True \
    --max_grad_norm 1.00 \
    --lr_scheduler_type cosine



# --checkpoint /project/lt200056-opgpth/weight_llama_2_finetune_7b_512_th100/checkpoint-250 \

# --use_flash_attention_2 True \

# --fsdp "full_shard auto_wrap" \
# --gradient_checkpointing True
34 changes: 34 additions & 0 deletions experiment_effect_of_pre-train/step3_submit_multinode_deepspeed.sh
@@ -0,0 +1,34 @@



export NCCL_DEBUG=INFO
export NCCL_SOCKET_IFNAME=hsn
export NCCL_P2P_DISABLE=1
#export FI_MR_CACHE_MONITOR=memhooks
#export NCCL_NET_GDR_LEVEL=3
#export NCCL_NET=IB
#export NCCL_IB_HCA=mlx5
#export CXI_FORK_SAFE=1
#export CXI_FORK_SAFE_HP=1
#export FI_CXI_DISABLE_CQ_HUGETLB=1

#echo "using FI_MR_CACHE_MONITOR=memhooks"

START=`date`
starttime=$(date +%s)

export WANDB_MODE="offline"

# sent to sub script
export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=12802
export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`

echo go $COUNT_NODE
echo $HOSTNAMES

srun sh step3_smultinode_deepspeed.sh

current_date_time="`date "+%Y-%m-%d %H:%M:%S"`";
echo $current_date_time;
175 changes: 175 additions & 0 deletions experiment_effect_of_pre-train/train_v2.py
@@ -0,0 +1,175 @@
from dataclasses import dataclass, field
from typing import Dict, Optional, Sequence, List

import torch
import transformers
from transformers import Trainer
from datasets import load_from_disk, Dataset
from torch.utils.data import IterableDataset

import os
import random


@dataclass
class ModelArguments:
    model_name_or_path: Optional[str] = field(default="facebook/opt-125m")
    tokenizer_name_or_path: Optional[str] = field(default=None)
    use_flash_attention_2: bool = field(default=False)


@dataclass
class DataArguments:
    data_path: Optional[List[str]] = field(
        default_factory=list, metadata={"help": "Path to the tokenized data."}
    )
    data_weights: Optional[List[float]] = field(
        default_factory=list
    )
    train_split: Optional[str] = field(default="train")
    eval_split: Optional[str] = field(default="eval")


@dataclass
class TrainingArguments(transformers.TrainingArguments):
    cache_dir: Optional[str] = field(default=None)
    optim: str = field(default="adamw_torch")
    checkpoint: Optional[str] = field(default=None)
    model_max_length: int = field(
        default=512,
        metadata={
            "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."  # noqa
        },
    )

class CombinedDataset(IterableDataset):
    def __init__(self, datasets, seed, weights=None):
        self._seed = seed
        self._datasets = datasets
        self._weights = weights

        n_datasets = len(datasets)

        if weights is None:
            self._weights = [1 / n_datasets] * n_datasets

        len_datasets = []
        for dataset in self._datasets:
            len_datasets.append(len(dataset))
        self.total_len = int(len_datasets[0] * sum(self._weights))

    def __iter__(self):
        return CombinedDatasetIterator(self._datasets, self._seed, self._weights)

    def __len__(self):
        return self.total_len


class CombinedDatasetIterator:
    def __init__(self, datasets, seed, weights):
        self._datasets = [iter(el) for el in datasets]
        self._weights = weights
        self._rng = random.Random(seed)

    def __next__(self):
        (dataset,) = self._rng.choices(self._datasets, weights=self._weights, k=1)

        return next(dataset)


class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning."""

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        input_ids = [instance["input_ids"] for instance in instances]
        input_ids = torch.tensor(input_ids)  # type: ignore
        return {
            "input_ids": input_ids,  # type: ignore
            "labels": input_ids,  # type: ignore
        }

# def add_set(set_):
#     def _func(sample):
#         return {'set': set_}
#     return _func

def load_dataset(paths, weights, split, seed=42):
    datasets = []
    # i = 0
    for path in paths:
        path_to_split = os.path.join(path, split)
        dataset = load_from_disk(path_to_split)
        # dataset = Dataset.from_dict(dataset[:100])
        # if i > 0:
        #     dataset = Dataset.from_dict(dataset[:50])
        # dataset = dataset.map(add_set(i))
        num_samples_to_drop = int(0.03 * len(dataset))

        # Use dataset.select to drop the first 3% of the dataset
        dataset = dataset.select(list(range(num_samples_to_drop, len(dataset))))

        datasets.append(dataset)
        # i += 1
    return CombinedDataset(datasets, seed, weights)


def make_supervised_data_module(data_args: DataArguments, seed=42) -> Dict:
    """Make dataset and collator for supervised fine-tuning."""
    train_dataset = load_dataset(data_args.data_path, data_args.data_weights, data_args.train_split, seed)
    eval_dataset = load_dataset(data_args.data_path, data_args.data_weights, data_args.eval_split, seed)
    data_collator = DataCollatorForSupervisedDataset()
    return {
        "train_dataset": train_dataset,
        "eval_dataset": eval_dataset,
        "data_collator": data_collator,
    }


def train():
    parser = transformers.HfArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments)
    )
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # if training_args.checkpoint is not None:
    #     model_args.model_name_or_path = training_args.checkpoint
    # else:
    #     training_args.checkpoint = False

    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=training_args.cache_dir,
        use_flash_attention_2=model_args.use_flash_attention_2
    )

    if model_args.tokenizer_name_or_path is None:
        model_args.tokenizer_name_or_path = model_args.model_name_or_path

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_args.tokenizer_name_or_path,
        cache_dir=training_args.cache_dir,
        model_max_length=training_args.model_max_length,
        padding_side="right",
        use_fast=False,
    )

    # if tokenizer is not None and model.vocab_size != len(tokenizer):
    #     model.resize_token_embeddings(len(tokenizer))

    data_module = make_supervised_data_module(
        data_args=data_args, seed=training_args.data_seed
    )
    trainer = Trainer(
        model=model, tokenizer=tokenizer, args=training_args, **data_module
    )
    # if training_args.checkpoint is not None:
    #     trainer.load_model(training_args.checkpoint)
    trainer.train(training_args.checkpoint)
    # trainer.train()
    trainer.save_state()
    trainer.save_model(output_dir=training_args.output_dir)


if __name__ == "__main__":
    train()
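
Note: the weighted mixing in CombinedDataset and CombinedDatasetIterator boils down to picking one source dataset per step with probability proportional to its weight, then taking that dataset's next example. A self-contained sketch of that rule, with toy lists standing in for the tokenized Thai and English splits and the 0.9/0.1 weights taken from step3_smultinode_deepspeed.sh:

import random
from collections import Counter

thai_like = iter([{"src": "tha"}] * 10_000)  # stands in for the DATA_THA_DIR split
eng_like = iter([{"src": "en"}] * 10_000)    # stands in for the DATA_ENG_DIR split

rng = random.Random(42)                      # matches --data_seed 42
datasets, weights = [thai_like, eng_like], [0.9, 0.1]

picks = []
for _ in range(1_000):
    (dataset,) = rng.choices(datasets, weights=weights, k=1)
    picks.append(next(dataset)["src"])

print(Counter(picks))  # roughly 900 'tha' to 100 'en', mirroring --data_weights 0.9 0.1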
30 changes: 30 additions & 0 deletions experiment_effect_of_pre-train/update_ModelVocab.py
@@ -0,0 +1,30 @@
import argparse
import transformers

def main(args):
    # Load model
    model = transformers.AutoModelForCausalLM.from_pretrained(args.model_name_or_path)

    # Load tokenizer
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        args.tokenizer_name_or_path,
        padding_side="right",
        use_fast=False,
    )

    # Resize token embeddings if necessary
    if tokenizer is not None and model.config.vocab_size != len(tokenizer):
        model.resize_token_embeddings(len(tokenizer))

    # Save model and tokenizer
    model.save_pretrained(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Save model and tokenizer.")
    parser.add_argument("--model_name_or_path", type=str, help="Path or name of the pre-trained model.")
    parser.add_argument("--tokenizer_name_or_path", type=str, help="Path or name of the pre-trained tokenizer.")
    parser.add_argument("--output_dir", type=str, help="Directory where the model and tokenizer will be saved.")

    args = parser.parse_args()
    main(args)
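
Note: a quick sketch of what the resize step does, using facebook/opt-125m (the default checkpoint in train_v2.py) and a couple of hypothetical added tokens purely for illustration:

import transformers

model = transformers.AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
tokenizer = transformers.AutoTokenizer.from_pretrained("facebook/opt-125m", use_fast=False)

# Pretend the merged tokenizer gained some new pieces (names are made up).
tokenizer.add_tokens(["<extra_piece_1>", "<extra_piece_2>"])

if model.config.vocab_size != len(tokenizer):
    model.resize_token_embeddings(len(tokenizer))  # embedding rows now match the tokenizer

print(model.get_input_embeddings().weight.shape)   # first dimension equals len(tokenizer)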