-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
60 additions
and
0 deletions.
There are no files selected for viewing
7 changes: 7 additions & 0 deletions
7
experiment_effect_of_data_preprocessing/train_tiny_llama/step1_1submit_data_hf_openthai.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
#!/usr/bin/env bash
# Runs scripts/prepare_hf_datasets.py with two positional path arguments.
# Judging by the variable names, the first is a Hugging Face dataset location
# and the second is where JSONL output is saved -- confirm against the Python
# script, which is not part of this file.
#
# The script path is relative, so run this from the directory that contains
# scripts/.
set -euo pipefail

# First positional argument passed to prepare_hf_datasets.py.
HUGGINGFACE_DATASET_SRC_PATH=/workspace/datset1
# Second positional argument passed to prepare_hf_datasets.py.
JSONL_DATASET_SAVE_PATH=/workspace/datset2

# Quoted so that a path containing spaces or glob characters stays one argument.
python scripts/prepare_hf_datasets.py \
  "$HUGGINGFACE_DATASET_SRC_PATH" \
  "$JSONL_DATASET_SAVE_PATH"
27 changes: 27 additions & 0 deletions
27
experiment_effect_of_data_preprocessing/train_tiny_llama/step1_2submit_data_openthai.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
#!/usr/bin/env bash
# Runs scripts/prepare_openthaigpt.py twice with the same source, tokenizer and
# destination directories: once with --split train and once with --split eval.
#
# The script path is relative, so run this from the directory that contains
# scripts/.

# --- Environment setup -------------------------------------------------------
# Start from a clean module environment, restore the system defaults, then load
# the modules this step needs.
module purge
source /opt/cray/pe/cpe/23.09/restore_lmod_system_defaults.sh
module load Miniconda3
module load cudatoolkit/23.3_11.8
module load PrgEnv-gnu
module load cpe-cuda

# Leave whatever conda environment is currently active before switching.
conda deactivate
# Without the project environment the Python steps below would run against the
# wrong interpreter, so stop here instead of continuing silently.
conda activate /project/lt200056-opgpth/new/TinyLlama_2024/.conda_new || {
  printf 'error: could not activate the conda environment\n' >&2
  exit 1
}

# From here on, stop at the first failing command so that a failed train-split
# run is not hidden by the eval-split run that follows it. Enabled only after
# the environment setup because the module/conda shell functions are outside
# this script's control; -u is left off for the same reason.
set -e

# --- Inputs and outputs ------------------------------------------------------
SOURCE_DIR=/workspace/source
TOKENIZER_DIR=/workspace/data
OUTPUT_DIR=/workspace/output

# --- Train split -------------------------------------------------------------
# No --chunk_size is passed here, so the Python script's own default applies.
python scripts/prepare_openthaigpt.py \
  --source_path "$SOURCE_DIR" \
  --split train --percentage 1.0 \
  --tokenizer_path "$TOKENIZER_DIR" \
  --destination_path "$OUTPUT_DIR"

# --- Eval split --------------------------------------------------------------
# Same directories as above, with an explicit --chunk_size.
python scripts/prepare_openthaigpt.py \
  --source_path "$SOURCE_DIR" \
  --split eval --percentage 1.0 \
  --tokenizer_path "$TOKENIZER_DIR" \
  --destination_path "$OUTPUT_DIR" \
  --chunk_size 524544
26 changes: 26 additions & 0 deletions
26
experiment_effect_of_data_preprocessing/train_tiny_llama/step2_submit_train.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
|
||
|
||
#!/usr/bin/env bash
# Launches pretrain/tinyllama.py through srun with fixed train/validation data
# directories, 4 devices and 10 nodes.
#
# The script path is relative, so run this from the directory that contains
# pretrain/.
#
# NOTE(review): this file has no #SBATCH directives; --devices 4 and
# --num_nodes 10 presumably have to match the allocation srun runs in --
# confirm how the job is submitted.
# NOTE(review): no conda environment is activated in this script; confirm that
# the python found after loading Miniconda3 has the training dependencies.

# --- Environment setup -------------------------------------------------------
# source /opt/cray/pe/cpe/23.09/restore_lmod_system_defaults.sh
module purge
module load Miniconda3/22.11.1-1
# module load cpe-cuda/23.03
module load cudatoolkit/23.3_11.8
module load gcc/11.2.0
module load PrgEnv-nvidia
# Alternatives that are currently disabled:
# module load gcc/11.2
# module load PrgEnv-gnu
# module load cpe-cuda
# module load cudatoolkit/22.7_11.7
# module load craype-accel-nvidia80
# module load aws-ofi-nccl

# --- Data locations ----------------------------------------------------------
TRAIN_DATA_DIR=/workspace/train
VAL_DATA_DIR=/workspace/val

# Put Weights & Biases into offline mode for every process srun starts.
export WANDB_MODE=offline

# The last argument line must not end in a backslash: a dangling continuation
# would splice whatever follows this command into its argument list.
srun python pretrain/tinyllama.py \
  --train_data_dir "$TRAIN_DATA_DIR" \
  --val_data_dir "$VAL_DATA_DIR" \
  --devices 4 \
  --num_nodes 10
|