diff --git a/experiment_effect_of_data_preprocessing/train_tiny_llama/step1_1submit_data_hf_openthai.sh b/experiment_effect_of_data_preprocessing/train_tiny_llama/step1_1submit_data_hf_openthai.sh
new file mode 100644
index 0000000..c928d2b
--- /dev/null
+++ b/experiment_effect_of_data_preprocessing/train_tiny_llama/step1_1submit_data_hf_openthai.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+# Step 1.1: run scripts/prepare_hf_datasets.py with a source path and a save
+# path, passed positionally in that order. Judging by the variable names this
+# exports a Hugging Face dataset as JSONL -- TODO confirm against the script.
+# The script path is relative, so run this from the directory holding scripts/.
+
+HUGGINGFACE_DATASET_SRC_PATH=/workspace/datset1
+JSONL_DATASET_SAVE_PATH=/workspace/datset2
+
+# Expansions are quoted so a path containing spaces stays one argument. The
+# last argument carries no trailing backslash: with nothing after it, bash can
+# hand a lone backslash to the Python script as an unexpected third argument.
+python scripts/prepare_hf_datasets.py \
+    "$HUGGINGFACE_DATASET_SRC_PATH" \
+    "$JSONL_DATASET_SAVE_PATH"
diff --git a/experiment_effect_of_data_preprocessing/train_tiny_llama/step1_2submit_data_openthai.sh b/experiment_effect_of_data_preprocessing/train_tiny_llama/step1_2submit_data_openthai.sh
new file mode 100644
index 0000000..9a5e780
--- /dev/null
+++ b/experiment_effect_of_data_preprocessing/train_tiny_llama/step1_2submit_data_openthai.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# Step 1.2: run scripts/prepare_openthaigpt.py once for the train split and
+# once for the eval split, after setting up modules and the conda environment.
+# The script path is relative, so run this from the directory holding scripts/.
+
+# Reset the module environment, then load the toolchain modules.
+module purge
+source /opt/cray/pe/cpe/23.09/restore_lmod_system_defaults.sh
+module load Miniconda3
+module load cudatoolkit/23.3_11.8
+module load PrgEnv-gnu
+module load cpe-cuda
+
+# Drop the currently active conda environment before activating the project one.
+conda deactivate
+conda activate /project/lt200056-opgpth/new/TinyLlama_2024/.conda_new
+
+SOURCE_DIR=/workspace/source
+TOKENIZER_DIR=/workspace/data
+OUTPUT_DIR=/workspace/output
+
+# Train split. No --chunk_size is given for this run.
+python scripts/prepare_openthaigpt.py \
+    --source_path "$SOURCE_DIR" \
+    --split train --percentage 1.0 \
+    --tokenizer_path "$TOKENIZER_DIR" \
+    --destination_path "$OUTPUT_DIR"
+
+# Eval split. Same paths as above, with an explicit --chunk_size.
+python scripts/prepare_openthaigpt.py \
+    --source_path "$SOURCE_DIR" \
+    --split eval --percentage 1.0 \
+    --tokenizer_path "$TOKENIZER_DIR" \
+    --destination_path "$OUTPUT_DIR" \
+    --chunk_size 524544
diff --git a/experiment_effect_of_data_preprocessing/train_tiny_llama/step2_submit_train.sh b/experiment_effect_of_data_preprocessing/train_tiny_llama/step2_submit_train.sh new file mode 100644 index 0000000..e1d7c42 --- /dev/null +++ 
b/experiment_effect_of_data_preprocessing/train_tiny_llama/step2_submit_train.sh @@ -0,0 +1,26 @@ + + +# source /opt/cray/pje/cpe/23.09/restore_lmod_system_defaults.sh +module purge +module load Miniconda3/22.11.1-1 +# module load cpe-cuda/23.03 +module load cudatoolkit/23.3_11.8 +module load gcc/11.2.0 +module load PrgEnv-nvidia +# module load gcc/11.2 +# module load PrgEnv-gnu +# module load cpe-cuda +# module load cudatoolkit/22.7_11.7 +# module load craype-accel-nvidia80 +# module load aws-ofi-nccl + +TRAIN_DATA_DIR=/workspace/train +VAL_DATA_DIR=/workspace/val + +export WANDB_MODE=offline +srun python pretrain/tinyllama.py \ + --train_data_dir $TRAIN_DATA_DIR \ + --val_data_dir $VAL_DATA_DIR \ + --devices 4 \ + --num_nodes 10 \ +