# prepare-wmt14en2dev2.sh
# Run from the repository root: the helper scripts are referenced as ../src/scripts from inside ./data.
# --
# Download the original and distilled dataset archives into ./data and unpack
# them there. Every later step of this script runs inside ./data.
#
# Abort on the first failing command or unset variable, so a failed download
# or extraction cannot cascade into the steps that follow.
set -eu
# -p: do not fail when ./data is left over from a previous run.
mkdir -p data
# Stop if the directory cannot be entered; otherwise everything below would be
# written into the caller's working directory.
cd data || exit 1
# -c: resume or skip an archive that is already present instead of saving a
# duplicate "*.zip.1" next to it.
wget -c http://dl.fbaipublicfiles.com/nat/original_dataset.zip
wget -c http://dl.fbaipublicfiles.com/nat/distill_dataset.zip
# -o: overwrite files from an earlier extraction without prompting, so reruns
# do not block waiting for interactive input.
unzip -o original_dataset.zip
unzip -o distill_dataset.zip
# Copy the unpacked corpora to short, flat file names, once per language side:
#   train.<lang>   <- train split of wmt14_ende
#   trainD.<lang>  <- train split of wmt14_ende_distill
#   dev.<lang>     <- valid split of wmt14_ende
#   test.<lang>    <- test split of wmt14_ende
orig_dir=wmt14_ende
distill_dir=wmt14_ende_distill
for lang in en de; do
  cp "${orig_dir}/train.en-de.${lang}" "train.${lang}"
  cp "${distill_dir}/train.en-de.${lang}" "trainD.${lang}"
  cp "${orig_dir}/valid.en-de.${lang}" "dev.${lang}"
  cp "${orig_dir}/test.en-de.${lang}" "test.${lang}"
done
# Run shuffle_and_split.py over the four training files (original and
# distilled, both languages) to produce the wmt14en2de.train.* files used for
# multi-GPU training.
python3 ../src/scripts/shuffle_and_split.py \
  input_paths:train.en,train.de,trainD.en,trainD.de \
  output_paths:wmt14en2de.train.en,wmt14en2de.train.de,wmt14en2de.train.enD,wmt14en2de.train.deD
# The dev and test files are copied unchanged under the wmt14en2de. prefix.
for part in dev.en dev.de test.en test.de; do
  cp "${part}" "wmt14en2de.${part}"
done
# Convert the dictionary files shipped in wmt14_ende into .pkl vocabulary
# files under voc_wmt14en2de/.
voc_dir=voc_wmt14en2de
mkdir -p "${voc_dir}"
# English dictionary -> v_enc.pkl, German dictionary -> v_slm.pkl.
python3 ../src/scripts/convert_vocab.py wmt14_ende/dict.en.txt "${voc_dir}/v_enc.pkl"
python3 ../src/scripts/convert_vocab.py wmt14_ende/dict.de.txt "${voc_dir}/v_slm.pkl"
# v_ilm.pkl is a plain copy of v_slm.pkl (both come from the German dictionary).
cp "${voc_dir}/v_slm.pkl" "${voc_dir}/v_ilm.pkl"