diff --git a/experiment_effect_of_data_preprocessing/train_tiny_llama/step1_1submit_data_hf_openthai.sh b/experiment_effect_of_data_preprocessing/train_tiny_llama/step1_1submit_data_hf_openthai.sh
new file mode 100644
index 0000000..c928d2b
--- /dev/null
+++ b/experiment_effect_of_data_preprocessing/train_tiny_llama/step1_1submit_data_hf_openthai.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+#
+# Step 1.1: run scripts/prepare_hf_datasets.py with a source and a save path.
+#
+# Launch from the directory that contains scripts/ (the Python script is
+# referenced by a relative path).
+# NOTE(review): judging by the variable names, this presumably converts a
+# Hugging Face dataset into JSONL files -- confirm against the Python script.
+# NOTE(review): "datset" in the paths below looks like a typo for "dataset";
+# left untouched because the paths are runtime values -- confirm.
+
+HUGGINGFACE_DATASET_SRC_PATH=/workspace/datset1
+JSONL_DATASET_SAVE_PATH=/workspace/datset2
+
+# Positional arguments: SRC path first, SAVE path second. Quoted so a path
+# containing spaces or glob characters is passed through as one argument.
+python scripts/prepare_hf_datasets.py \
+   "$HUGGINGFACE_DATASET_SRC_PATH" \
+   "$JSONL_DATASET_SAVE_PATH"
diff --git a/experiment_effect_of_data_preprocessing/train_tiny_llama/step1_2submit_data_openthai.sh b/experiment_effect_of_data_preprocessing/train_tiny_llama/step1_2submit_data_openthai.sh
new file mode 100644
index 0000000..9a5e780
--- /dev/null
+++ b/experiment_effect_of_data_preprocessing/train_tiny_llama/step1_2submit_data_openthai.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+#
+# Step 1.2: run scripts/prepare_openthaigpt.py for the train and eval splits.
+#
+# Launch from the directory that contains scripts/ (the Python script is
+# referenced by a relative path).
+
+# Reset the module environment, then load the toolchain used for this step.
+module purge
+source /opt/cray/pe/cpe/23.09/restore_lmod_system_defaults.sh
+module load Miniconda3
+module load cudatoolkit/23.3_11.8
+module load PrgEnv-gnu
+module load cpe-cuda
+
+# Leave whichever conda environment is active before activating the project one.
+conda deactivate
+conda activate /project/lt200056-opgpth/new/TinyLlama_2024/.conda_new
+
+SOURCE_DIR=/workspace/source
+TOKENIZER_DIR=/workspace/data
+OUTPUT_DIR=/workspace/output
+
+# Train split: --chunk_size is not passed here, so the Python script decides it.
+python scripts/prepare_openthaigpt.py \
+  --source_path "$SOURCE_DIR" \
+  --split train --percentage 1.0 \
+  --tokenizer_path "$TOKENIZER_DIR" \
+  --destination_path "$OUTPUT_DIR"
+
+# Eval split: same inputs and destination, with an explicit --chunk_size.
+python scripts/prepare_openthaigpt.py \
+  --source_path "$SOURCE_DIR" \
+  --split eval --percentage 1.0 \
+  --tokenizer_path "$TOKENIZER_DIR" \
+  --destination_path "$OUTPUT_DIR" \
+  --chunk_size 524544
diff --git a/experiment_effect_of_data_preprocessing/train_tiny_llama/step2_submit_train.sh b/experiment_effect_of_data_preprocessing/train_tiny_llama/step2_submit_train.sh
new file mode 100644
index 0000000..e1d7c42
--- /dev/null
+++ b/experiment_effect_of_data_preprocessing/train_tiny_llama/step2_submit_train.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+#
+# Step 2: launch pretrain/tinyllama.py through srun.
+#
+# Launch from the directory that contains pretrain/ (the Python script is
+# referenced by a relative path).
+
+# NOTE(review): the commented-out lines below look like alternative module
+# combinations kept for reference.
+# source /opt/cray/pe/cpe/23.09/restore_lmod_system_defaults.sh
+module purge
+module load Miniconda3/22.11.1-1
+# module load cpe-cuda/23.03
+module load cudatoolkit/23.3_11.8
+module load gcc/11.2.0
+module load PrgEnv-nvidia
+# module load gcc/11.2
+# module load PrgEnv-gnu
+# module load cpe-cuda
+# module load cudatoolkit/22.7_11.7
+# module load craype-accel-nvidia80
+# module load aws-ofi-nccl
+
+# NOTE(review): no conda environment is activated in this script -- confirm
+# that `python` resolves to the intended interpreter.
+
+TRAIN_DATA_DIR=/workspace/train
+VAL_DATA_DIR=/workspace/val
+
+# Exported so the processes started by srun inherit it; "offline" keeps
+# Weights & Biases logging local instead of syncing to its servers.
+export WANDB_MODE=offline
+
+# NOTE(review): --devices and --num_nodes presumably have to match the Slurm
+# allocation this runs in -- confirm.
+srun python pretrain/tinyllama.py \
+    --train_data_dir "$TRAIN_DATA_DIR" \
+    --val_data_dir "$VAL_DATA_DIR" \
+    --devices 4 \
+    --num_nodes 10