diff --git a/data/datasets/oa_dolly_15k_multilingual/README.md b/data/datasets/oa_dolly_15k_multilingual/README.md
new file mode 100644
index 0000000000..2085319e8a
--- /dev/null
+++ b/data/datasets/oa_dolly_15k_multilingual/README.md
@@ -0,0 +1 @@
+Reformat [argilla/databricks-dolly-15k-curated-multilingual](https://huggingface.co/datasets/argilla/databricks-dolly-15k-curated-multilingual) dataset
\ No newline at end of file
diff --git a/data/datasets/oa_dolly_15k_multilingual/create_dataset.py b/data/datasets/oa_dolly_15k_multilingual/create_dataset.py
new file mode 100644
index 0000000000..77725c182f
--- /dev/null
+++ b/data/datasets/oa_dolly_15k_multilingual/create_dataset.py
@@ -0,0 +1,58 @@
+import json
+from pathlib import Path
+
+from datasets import Dataset, load_dataset
+
+
+def build_dataset(data_file: str, include_context: bool = True) -> Dataset:
+    """Build a Dataset from a local JSONL dump of databricks-dolly-15k.
+
+    One record per line; each line is converted with `to_oa_format`.
+    """
+    # Explicit encoding: JSONL dumps are UTF-8, don't rely on the platform default.
+    json_data = [
+        to_oa_format(json.loads(line), include_context=include_context)
+        for line in Path(data_file).read_text(encoding="utf-8").splitlines()
+    ]
+
+    return Dataset.from_list(json_data)
+
+
+def to_oa_format(data: dict, include_context: bool = True) -> dict:
+    """Convert one dolly record to the Open-Assistant instruction format.
+
+    Expects `instruction`, `response`, `category` and (if `include_context`)
+    `context` keys on `data`.
+    """
+    output_data = {
+        "INSTRUCTION": data["instruction"],
+        "RESPONSE": data["response"],
+        "SOURCE": "databricks-dolly-15k",
+        "METADATA": {
+            "CATEGORY": data["category"],
+        },
+    }
+
+    if include_context:
+        output_data["METADATA"]["CONTEXT"] = data["context"]
+
+    return output_data
+
+
+def main():
+    """Reformat the curated multilingual dolly dataset and push it to the Hub."""
+    dataset = load_dataset("argilla/databricks-dolly-15k-curated-multilingual")
+    # One record per (language split, row); key order fixes the column order.
+    json_data = [
+        {
+            "INSTRUCTION": row["instruction"],
+            "INSTRUCTION_EN": row["instruction_original_en"],
+            "RESPONSE_EN": row["response_original_en"],
+            "RESPONSE": row["response"],
+            "SOURCE": "databricks-dolly-15k-curated-multilingual",
+            "METADATA": {
+                "CATEGORY": row["category"],
+                "CONTEXT": row["context"],
+                "LANG": lang,
+            },
+        }
+        for lang, data in dataset.items()
+        for row in data
+    ]
+    format_dataset = Dataset.from_list(json_data)
+    format_dataset.push_to_hub("blancsw/oa_dolly_15k_multilingual")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/model/README.md b/model/README.md
index c03aaf83c3..aef75e3e01 100644
--- a/model/README.md
+++ b/model/README.md
@@ -72,7 +72,7 @@ cd model_training
 
 # export shared modules
 export PYTHONPATH=$PYTHONPATH:../../oasst-shared
 
-python trainer_sft.py --configs defaults oa_dataset_only pythia --cache_dir $DATA_PATH --output_dir $MODEL_PATH/sft_model
+python trainer_sft.py --configs defaults oasst_only pythia-70m-deduped --cache_dir $DATA_PATH --output_dir $MODEL_PATH/sft_model
 
 # if you want to use wandb, add --wandb_entity your_username/team_name