From f28e5653fc7771885c86ae718ae0783ea57edc6e Mon Sep 17 00:00:00 2001
From: unknown <chanon.utupon@gmail.com>
Date: Fri, 16 Feb 2024 01:23:07 +0700
Subject: [PATCH 1/3] readme_exp4

---
 experiment_effect_of_dpo/README.md | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 experiment_effect_of_dpo/README.md

diff --git a/experiment_effect_of_dpo/README.md b/experiment_effect_of_dpo/README.md
new file mode 100644
index 0000000..951e37d
--- /dev/null
+++ b/experiment_effect_of_dpo/README.md
@@ -0,0 +1,19 @@
+## Train DPO
+
+```bash
+002-001-dpo-temp-0_3-v-all-ref.sh
+```
+
+### Configuration
+
+- BASE_MODEL: Name used when saving the model.
+- DATA_PATH: Path to the dataset.
+- EPOCH: Number of training epochs.
+- LR: Learning rate; 2e-5 for full fine-tuning and 2e-4 for LoRA.
+- GRADIENT_ACCUMULATION_STEPS: Number of gradient accumulation steps.
+- MAX_LEN: Maximum training length.
+- MAX_PROMPT_LEN: Maximum training prompt length.
+- MICRO_BSZ: Batch size per step.
+- VAL_SIZE: Size of the validation split.
+- WANDB_NAME: Wandb project name.
+- WARMUP_STEPS: Number of warmup steps for the scheduler.
\ No newline at end of file

From 25f09e14f03c7a051d12454f75d4e20fd970d802 Mon Sep 17 00:00:00 2001
From: Konthee <konthee1995@hotnmail.com>
Date: Fri, 16 Feb 2024 01:27:59 +0700
Subject: [PATCH 2/3] add Ex2

---
 .../README.md                                 |  1 +
 .../step1_1submit_data_hf_openthai.sh         |  7 +++++
 .../step1_2submit_data_openthai.sh            | 27 +++++++++++++++++++
 .../train_tiny_llama/step2_submit_train.sh    | 26 ++++++++++++++++++
 experiment_effect_of_dpo/README.md            |  1 +
 experiment_effect_of_pre-train/README.md      |  1 +
 6 files changed, 63 insertions(+)
 create mode 100644 experiment_effect_of_data_preprocessing/README.md
 create mode 100644 experiment_effect_of_data_preprocessing/train_tiny_llama/step1_1submit_data_hf_openthai.sh
 create mode 100644 experiment_effect_of_data_preprocessing/train_tiny_llama/step1_2submit_data_openthai.sh
 create mode 100644 experiment_effect_of_data_preprocessing/train_tiny_llama/step2_submit_train.sh
 create mode 100644 experiment_effect_of_dpo/README.md
 create mode 100644 experiment_effect_of_pre-train/README.md

diff --git a/experiment_effect_of_data_preprocessing/README.md b/experiment_effect_of_data_preprocessing/README.md
new file mode 100644
index 0000000..1983267
--- /dev/null
+++ b/experiment_effect_of_data_preprocessing/README.md
@@ -0,0 +1 @@
+## Experiment 1 
diff --git a/experiment_effect_of_data_preprocessing/train_tiny_llama/step1_1submit_data_hf_openthai.sh b/experiment_effect_of_data_preprocessing/train_tiny_llama/step1_1submit_data_hf_openthai.sh
new file mode 100644
index 0000000..c928d2b
--- /dev/null
+++ b/experiment_effect_of_data_preprocessing/train_tiny_llama/step1_1submit_data_hf_openthai.sh
@@ -0,0 +1,7 @@
+# Runs scripts/prepare_hf_datasets.py on the two paths below. The script path is relative to the working directory.
+HUGGINGFACE_DATASET_SRC_PATH=/workspace/datset1
+JSONL_DATASET_SAVE_PATH=/workspace/datset2
+
+python scripts/prepare_hf_datasets.py \
+   "$HUGGINGFACE_DATASET_SRC_PATH" \
+   "$JSONL_DATASET_SAVE_PATH"
\ No newline at end of file
diff --git a/experiment_effect_of_data_preprocessing/train_tiny_llama/step1_2submit_data_openthai.sh b/experiment_effect_of_data_preprocessing/train_tiny_llama/step1_2submit_data_openthai.sh
new file mode 100644
index 0000000..9a5e780
--- /dev/null
+++ b/experiment_effect_of_data_preprocessing/train_tiny_llama/step1_2submit_data_openthai.sh
@@ -0,0 +1,27 @@
+# Runs scripts/prepare_openthaigpt.py twice, once with --split train and once with --split eval.
+module purge
+source /opt/cray/pe/cpe/23.09/restore_lmod_system_defaults.sh
+module load Miniconda3
+module load cudatoolkit/23.3_11.8
+module load PrgEnv-gnu
+module load cpe-cuda
+
+conda deactivate
+conda activate /project/lt200056-opgpth/new/TinyLlama_2024/.conda_new
+
+SOURCE_DIR=/workspace/source
+TOKENIZER_DIR=/workspace/data
+OUTPUT_DIR=/workspace/output
+
+python scripts/prepare_openthaigpt.py \
+  --source_path "$SOURCE_DIR" \
+  --split train --percentage 1.0 \
+  --tokenizer_path "$TOKENIZER_DIR" \
+  --destination_path "$OUTPUT_DIR"
+# Eval split uses the same directories and additionally passes an explicit --chunk_size.
+python scripts/prepare_openthaigpt.py \
+  --source_path "$SOURCE_DIR" \
+  --split eval --percentage 1.0 \
+  --tokenizer_path "$TOKENIZER_DIR" \
+  --destination_path "$OUTPUT_DIR" \
+  --chunk_size 524544
\ No newline at end of file
diff --git a/experiment_effect_of_data_preprocessing/train_tiny_llama/step2_submit_train.sh b/experiment_effect_of_data_preprocessing/train_tiny_llama/step2_submit_train.sh
new file mode 100644
index 0000000..e1d7c42
--- /dev/null
+++ b/experiment_effect_of_data_preprocessing/train_tiny_llama/step2_submit_train.sh
@@ -0,0 +1,26 @@
+# Launches pretrain/tinyllama.py through srun with WANDB_MODE exported as offline.
+# Passes the train and validation data directories plus --devices 4 and --num_nodes 10 to the training script.
+# source /opt/cray/pe/cpe/23.09/restore_lmod_system_defaults.sh
+module purge
+module load Miniconda3/22.11.1-1
+# module load cpe-cuda/23.03
+module load cudatoolkit/23.3_11.8
+module load gcc/11.2.0
+module load PrgEnv-nvidia
+# module load gcc/11.2
+# module load PrgEnv-gnu
+# module load cpe-cuda
+# module load cudatoolkit/22.7_11.7
+# module load craype-accel-nvidia80
+# module load aws-ofi-nccl
+
+TRAIN_DATA_DIR=/workspace/train
+VAL_DATA_DIR=/workspace/val
+
+export WANDB_MODE=offline
+srun python pretrain/tinyllama.py \
+    --train_data_dir "$TRAIN_DATA_DIR" \
+    --val_data_dir "$VAL_DATA_DIR" \
+    --devices 4 \
+    --num_nodes 10
+
diff --git a/experiment_effect_of_dpo/README.md b/experiment_effect_of_dpo/README.md
new file mode 100644
index 0000000..1983267
--- /dev/null
+++ b/experiment_effect_of_dpo/README.md
@@ -0,0 +1 @@
+## Experiment 1 
diff --git a/experiment_effect_of_pre-train/README.md b/experiment_effect_of_pre-train/README.md
new file mode 100644
index 0000000..1983267
--- /dev/null
+++ b/experiment_effect_of_pre-train/README.md
@@ -0,0 +1 @@
+## Experiment 1 

From 27a3df3c886f83f41951ce2fe4f8e91e579c47b6 Mon Sep 17 00:00:00 2001
From: Konthee <konthee1995@hotnmail.com>
Date: Fri, 16 Feb 2024 01:31:44 +0700
Subject: [PATCH 3/3] add Ex2

---
 experiment_effect_of_dpo/README.md | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/experiment_effect_of_dpo/README.md b/experiment_effect_of_dpo/README.md
index 1983267..951e37d 100644
--- a/experiment_effect_of_dpo/README.md
+++ b/experiment_effect_of_dpo/README.md
@@ -1 +1,19 @@
-## Experiment 1 
+## Train DPO
+
+```bash
+002-001-dpo-temp-0_3-v-all-ref.sh
+```
+
+### Configuration
+
+- BASE_MODEL: Name used when saving the model.
+- DATA_PATH: Path to the dataset.
+- EPOCH: Number of training epochs.
+- LR: Learning rate; 2e-5 for full fine-tuning and 2e-4 for LoRA.
+- GRADIENT_ACCUMULATION_STEPS: Number of gradient accumulation steps.
+- MAX_LEN: Maximum training length.
+- MAX_PROMPT_LEN: Maximum training prompt length.
+- MICRO_BSZ: Batch size per step.
+- VAL_SIZE: Size of the validation split.
+- WANDB_NAME: Wandb project name.
+- WARMUP_STEPS: Number of warmup steps for the scheduler.
\ No newline at end of file