- ASR: https://huggingface.co/thanhduycao/wav2vec2-large-finetune-aug-on-fly-synthesis-fix-60-epoch-v2/ (the checkpoint info, i.e. the revision code, is in scripts/predict.sh)
- Spoken-norm: https://huggingface.co/linhtran92/finetuned_taggenv2_55epoch_encoder_embeddings
- NLU: file JointIDSF_PhoBERTencoder.zip in folder training/soict_hackathon_JointIDSF/
-
Download the JointIDSF model and move it into the folder training/soict_hackathon_JointIDSF/
-
Link to the model zip: https://drive.google.com/drive/folders/1SXvzXiHb-0OI4c7PfYpfmxO_oVQxO-s-?usp=sharing
# Set up requirements: make the setup script executable, then run it.
chmod +x scripts/run_commands.sh
scripts/run_commands.sh
# Make the prediction script executable, then run it.
chmod +x scripts/predict.sh
scripts/predict.sh
The results are written to the folder training/soict_hackathon_JointIDSF/ in the file "predictions.jsonl".
More detailed training instructions are in the README.md of this folder
# Enter the ASR fine-tuning folder, make the training script executable,
# run it, then return to the starting directory.
cd training/ASR-Wav2vec-Finetune
chmod +x asr_train.sh
./asr_train.sh
cd ../..
More detailed training instructions are in the README.md of this folder
# Enter the norm-tuned folder, make the training script executable,
# run it, then return to the starting directory.
cd training/norm-tuned
chmod +x norm_train.sh
./norm_train.sh
cd ../..
More detailed training instructions are in the README.md of this folder
cd training
# -R is placed before the mode (the POSIX argument order) so the command also
# works with chmod implementations that do not accept options after the mode.
chmod -R 755 soict_hackathon_JointIDSF
cd soict_hackathon_JointIDSF
# (Important)
# Before running nlu_train.sh, delete the "models" and
# "data_aug_full_0919_22" folders if they exist. `rm -rf` does not fail
# when a folder is already absent, so these are safe to run unconditionally.
rm -rf models/
rm -rf data_aug_full_0919_22/
# Make the NLU training script executable, run it, then return to the
# starting directory.
chmod +x nlu_train.sh
./nlu_train.sh
cd ../..
cd synthesis-data-for-ASR
pip install -r requirements.txt
# The value passed to --token (a Hugging Face access token) is read from the
# HF_TOKEN environment variable instead of being hard-coded. Export it first:
#   export HF_TOKEN=<your access token>
# "${HF_TOKEN:?...}" makes a command fail with the given message when the
# variable is unset or empty, instead of passing an empty token.
# NOTE(review): any token that was previously committed in this file should be
# treated as leaked and revoked.
CUDA_VISIBLE_DEVICES=0 python create_transcription_wer.py --data_links="thanhduycao/soict_train_dataset" --output_path="thanhduycao/soict_train_dataset_with_wer_validate" --token="${HF_TOKEN:?set HF_TOKEN to your Hugging Face access token}" --num_workers=2
CUDA_VISIBLE_DEVICES=0 python lyric-alignment/predict.py --data_links="thanhduycao/soict_train_dataset_with_wer_validate" --output_path="thanhduycao/data_for_synthesis_with_entities_align_v5_validate" --token="${HF_TOKEN:?set HF_TOKEN to your Hugging Face access token}" --num_workers=4
CUDA_VISIBLE_DEVICES=0 python create_entity_dataset.py --data_links="thanhduycao/data_for_synthesis_with_entities_align_v5_validate" --output_path="thanhduycao/data_for_synthesis_entities_validate" --token="${HF_TOKEN:?set HF_TOKEN to your Hugging Face access token}" --num_workers=1
CUDA_VISIBLE_DEVICES=0 python create_synthesis_dataset.py --data_links="thanhduycao/data_for_synthesis_with_entities_align_v5_validate" --output_path="thanhduycao/data_synthesis_validate" --token="${HF_TOKEN:?set HF_TOKEN to your Hugging Face access token}" --num_workers=1