From ee0c0766355c80d4ad0b133086a09a7e0a2fc16c Mon Sep 17 00:00:00 2001 From: SwanBlanc Date: Wed, 9 Aug 2023 08:33:09 +0200 Subject: [PATCH 1/2] edit doc --- model/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model/README.md b/model/README.md index c03aaf83c3..aef75e3e01 100644 --- a/model/README.md +++ b/model/README.md @@ -72,7 +72,7 @@ cd model_training # export shared modules export PYTHONPATH=$PYTHONPATH:../../oasst-shared -python trainer_sft.py --configs defaults oa_dataset_only pythia --cache_dir $DATA_PATH --output_dir $MODEL_PATH/sft_model +python trainer_sft.py --configs defaults oasst_only pythia-70m-deduped --cache_dir $DATA_PATH --output_dir $MODEL_PATH/sft_model # if you want to use wandb, add --wandb_entity your_username/team_name From 9775d717b92b6cfe25e71f422ff68f95c471e838 Mon Sep 17 00:00:00 2001 From: SwanBlanc Date: Wed, 9 Aug 2023 12:31:01 +0200 Subject: [PATCH 2/2] Add multilingual dolly --- .../oa_dolly_15k_multilingual/README.md | 1 + .../create_dataset.py | 55 +++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 data/datasets/oa_dolly_15k_multilingual/README.md create mode 100644 data/datasets/oa_dolly_15k_multilingual/create_dataset.py diff --git a/data/datasets/oa_dolly_15k_multilingual/README.md b/data/datasets/oa_dolly_15k_multilingual/README.md new file mode 100644 index 0000000000..2085319e8a --- /dev/null +++ b/data/datasets/oa_dolly_15k_multilingual/README.md @@ -0,0 +1 @@ +Reformat [argilla/databricks-dolly-15k-curated-multilingual](https://huggingface.co/datasets/argilla/databricks-dolly-15k-curated-multilingual) dataset \ No newline at end of file diff --git a/data/datasets/oa_dolly_15k_multilingual/create_dataset.py b/data/datasets/oa_dolly_15k_multilingual/create_dataset.py new file mode 100644 index 0000000000..77725c182f --- /dev/null +++ b/data/datasets/oa_dolly_15k_multilingual/create_dataset.py @@ -0,0 +1,55 @@ +import json +from pathlib import Path + +from datasets 
import Dataset, load_dataset


def build_dataset(data_file: str, include_context: bool = True) -> Dataset:
    """Load a JSON-lines dolly dump and convert it into an OA-format Dataset.

    Args:
        data_file: Path to a JSONL file containing one dolly record per line.
        include_context: When True, copy each record's "context" field into
            the METADATA dict of the converted record.

    Returns:
        A ``datasets.Dataset`` whose rows follow the Open Assistant schema.
    """
    # One JSON object per line; decode and reformat each record.
    # Encoding is pinned to UTF-8 so a non-UTF-8 locale default (e.g. on
    # Windows) cannot corrupt or reject the text.
    json_data = [
        to_oa_format(json.loads(line), include_context=include_context)
        for line in Path(data_file).read_text(encoding="utf-8").splitlines()
    ]

    dataset = Dataset.from_list(json_data)
    return dataset


def to_oa_format(data: dict, include_context: bool = True) -> dict:
    """Map one raw dolly record to the Open Assistant instruction schema.

    Args:
        data: A raw record with "instruction", "response", "category" and
            (when *include_context* is True) "context" keys.
        include_context: When True, carry the record's "context" into METADATA.

    Returns:
        A dict with INSTRUCTION / RESPONSE / SOURCE / METADATA keys.
    """
    output_data = {
        "INSTRUCTION": data["instruction"],
        "RESPONSE": data["response"],
        "SOURCE": "databricks-dolly-15k",
        "METADATA": {
            "CATEGORY": data["category"],
        },
    }

    if include_context:
        output_data["METADATA"]["CONTEXT"] = data["context"]

    return output_data


def main():
    """Reformat the curated multilingual dolly dataset and push it to the Hub."""
    dataset = load_dataset("argilla/databricks-dolly-15k-curated-multilingual")
    # Flatten every language split into a single list, tagging each row with
    # its language so the origin is preserved after merging.
    json_data = [
        {
            "INSTRUCTION": row["instruction"],
            "INSTRUCTION_EN": row["instruction_original_en"],
            "RESPONSE_EN": row["response_original_en"],
            "RESPONSE": row["response"],
            "SOURCE": "databricks-dolly-15k-curated-multilingual",
            "METADATA": {
                "CATEGORY": row["category"],
                "CONTEXT": row["context"],
                "LANG": lang,
            },
        }
        for lang, data in dataset.items()
        for row in data
    ]
    format_dataset = Dataset.from_list(json_data)
    format_dataset.push_to_hub("blancsw/oa_dolly_15k_multilingual")


if __name__ == "__main__":
    main()