resolve_conflict

OpenThaiGPT · Feb 15, 2024 · 292e0ba · 292e0ba
2 parents 50ea9ae + f4932b0
commit 292e0ba
Show file tree

Hide file tree

Showing 3 changed files with 60 additions and 0 deletions.
diff --git a/experiment_effect_of_data_preprocessing/train_tiny_llama/step1_1submit_data_hf_openthai.sh b/experiment_effect_of_data_preprocessing/train_tiny_llama/step1_1submit_data_hf_openthai.sh
@@ -0,0 +1,7 @@
+HUGGINGFACE_DATASET_SRC_PATH=/workspace/datset1
+JSONL_DATASET_SAVE_PATH=/workspace/datset2
+# NOTE(review): "datset" above looks like a typo for "dataset" - confirm the real directories.
+# Run the dataset preparation script: source path first, save path second.
+python scripts/prepare_hf_datasets.py \
+   "$HUGGINGFACE_DATASET_SRC_PATH" \
+   "$JSONL_DATASET_SAVE_PATH"
diff --git a/experiment_effect_of_data_preprocessing/train_tiny_llama/step1_2submit_data_openthai.sh b/experiment_effect_of_data_preprocessing/train_tiny_llama/step1_2submit_data_openthai.sh
@@ -0,0 +1,27 @@
+module purge
+source /opt/cray/pe/cpe/23.09/restore_lmod_system_defaults.sh
+module load Miniconda3
+module load cudatoolkit/23.3_11.8
+module load PrgEnv-gnu
+module load cpe-cuda
+# Leave any active conda environment, then activate the project one.
+conda deactivate
+conda activate /project/lt200056-opgpth/new/TinyLlama_2024/.conda_new
+
+# Paths shared by both prepare_openthaigpt.py runs below.
+SOURCE_DIR=/workspace/source
+TOKENIZER_DIR=/workspace/data
+OUTPUT_DIR=/workspace/output
+# First run: the "train" split.
+python scripts/prepare_openthaigpt.py \
+  --source_path "$SOURCE_DIR" \
+  --split train --percentage 1.0 \
+  --tokenizer_path "$TOKENIZER_DIR" \
+  --destination_path "$OUTPUT_DIR"
+# Second run: the "eval" split; only this run passes an explicit --chunk_size.
+python scripts/prepare_openthaigpt.py \
+  --source_path "$SOURCE_DIR" \
+  --split eval --percentage 1.0 \
+  --tokenizer_path "$TOKENIZER_DIR" \
+  --destination_path "$OUTPUT_DIR" \
+  --chunk_size 524544
diff --git a/experiment_effect_of_data_preprocessing/train_tiny_llama/step2_submit_train.sh b/experiment_effect_of_data_preprocessing/train_tiny_llama/step2_submit_train.sh
@@ -0,0 +1,26 @@
+# Load the toolchain modules, then launch pretrain/tinyllama.py through srun.
+
+# source /opt/cray/pe/cpe/23.09/restore_lmod_system_defaults.sh
+module purge
+module load Miniconda3/22.11.1-1
+# module load cpe-cuda/23.03
+module load cudatoolkit/23.3_11.8
+module load gcc/11.2.0
+module load PrgEnv-nvidia
+# module load gcc/11.2
+# module load PrgEnv-gnu
+# module load cpe-cuda
+# module load cudatoolkit/22.7_11.7
+# module load craype-accel-nvidia80
+# module load aws-ofi-nccl
+
+TRAIN_DATA_DIR=/workspace/train
+VAL_DATA_DIR=/workspace/val
+
+export WANDB_MODE=offline  # W&B offline mode: the run is recorded locally, not synced
+srun python pretrain/tinyllama.py \
+    --train_data_dir "$TRAIN_DATA_DIR" \
+    --val_data_dir "$VAL_DATA_DIR" \
+    --devices 4 \
+    --num_nodes 10
+