-
Notifications
You must be signed in to change notification settings - Fork 35
/
Copy pathsplit_and_convert_dpr_queries.sh
executable file
·94 lines (75 loc) · 2.53 KB
/
split_and_convert_dpr_queries.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/bin/bash -e
# Keep errexit enabled even when the script is started as `bash <script>`:
# in that case the options on the shebang line are not applied.
set -e
#
# This script:
# 1. Splits/copies the downloaded Facebook DPR queries. The results are stored
#    in the raw-input sub-directory. The training queries are split into three
#    parts: bitext, train_fusion, and development (dev) queries.
#    It is possible to specify partition sizes.
# 2. Converts all the queries.
#

# Both files are resolved relative to the current working directory.
source ./config.sh
source ./common_proc.sh

# Configuration variables that the rest of the script relies on.
# NOTE(review): presumably set by the sourced files above — confirm.
checkVarNonEmpty "COLLECT_ROOT"
checkVarNonEmpty "INPUT_DATA_SUBDIR"
checkVarNonEmpty "INPUT_RAW_SUBDIR"
checkVarNonEmpty "BITEXT_SUBDIR"
checkVarNonEmpty "QUESTION_FILE_JSONL"
checkVarNonEmpty "QREL_FILE"
checkVarNonEmpty "DERIVED_DATA_SUBDIR"

# Option tables read by parseArguments.
# NOTE(review): entries look like (option name, variable name, help text)
# triples — confirm against common_proc.sh.
boolOpts=("h" "help" "print help")

# Partition names and their default sizes (an empty size means "all remaining").
partNames="bitext,train_fusion,dev"
partSizes=",2500,2500"
seed=0

paramOpts=(
  "seed" "seed" "random seed, default ($seed)"
  "partition_sizes" "partSizes" "sizes for partitions $partNames, empty means all remaining, default: $partSizes"
)

# "$@" is quoted so that every command-line argument is forwarded as exactly
# one word, even if it contains whitespace or glob characters.
parseArguments "$@"
# Validate the command line. Three positional arguments are required:
#   1. the collection (sub-directory of $COLLECT_ROOT),
#   2. the directory with the downloaded DPR files,
#   3. the collection type (nq, trivia, or squad).
# posArgs and help are read here but not assigned in this script.
# NOTE(review): presumably both are filled in by parseArguments — confirm.
usageMain="<$SAMPLE_COLLECT_ARG> <download directory> <collection type>"

if [ "$help" = "1" ] ; then
  # The usage string is quoted so it reaches genUsage as a single argument,
  # exactly like in the error paths below.
  genUsage "$usageMain"
  exit 1
fi

collect=${posArgs[0]}
if [ "$collect" = "" ] ; then
  genUsage "$usageMain" "Specify $SAMPLE_COLLECT_ARG (1st arg)"
  exit 1
fi

downloadDir=${posArgs[1]}
if [ "$downloadDir" = "" ] ; then
  genUsage "$usageMain" "Specify the download directory (2d arg)"
  exit 1
fi

colType=${posArgs[2]}
if [ "$colType" = "" ] ; then
  genUsage "$usageMain" "Specify a collection type: nq, trivia, or squad (3d arg)"
  exit 1
fi
# Raw-input directory of the collection: it receives a copy of the official
# development set and the destination-file prefix for the splitter points here.
inputRawDir="$COLLECT_ROOT/$collect/$INPUT_RAW_SUBDIR"
# mkdir -p succeeds when the directory already exists, so no existence test is needed.
mkdir -p "$inputRawDir"

# The downloaded dev set is kept as is, under the "dev_official" part name.
cp "$downloadDir/${colType}_dev.json.gz" "$inputRawDir/${colType}_dev_official.json.gz"

# Split the downloaded training queries into the partitions listed in $partNames.
# NOTE(review): the splitter presumably writes
# "$inputRawDir/${colType}_<partition>.json.gz" — confirm in split_dpr_raw_queries.py.
./data_convert/wikipedia_dpr/split_dpr_raw_queries.py \
  --seed "$seed" \
  --src_file "$downloadDir/${colType}_train.json.gz" \
  --partitions_names "$partNames" \
  --partitions_sizes "$partSizes" \
  --dst_file_pref "$inputRawDir/${colType}"
# Finally convert the queries: every part is read from
# "$inputRawDir/${colType}_<part>.json.gz" and its query and QREL files are
# written to the part's own sub-directory of the input-data directory.
for part in bitext train_fusion dev dev_official ; do
  outDataDir="$COLLECT_ROOT/$collect/$INPUT_DATA_SUBDIR/$part"
  # mkdir -p succeeds when the directory already exists.
  mkdir -p "$outDataDir"

  # Only the bitext part gets the extra bitext output option. The option is
  # kept in an array so that a path containing whitespace remains a single
  # argument; an empty array expands to no arguments at all.
  bitextPathOpt=()
  if [ "$part" = "bitext" ] ; then
    bitextPathOpt=(--out_bitext_path "$COLLECT_ROOT/$collect/$DERIVED_DATA_SUBDIR/$BITEXT_SUBDIR")
  fi

  ./data_convert/wikipedia_dpr/convert_queries.py \
    --bert_tokenize \
    --input "$inputRawDir/${colType}_${part}.json.gz" \
    --part_type "$part" \
    --output_queries "$outDataDir/$QUESTION_FILE_JSONL" \
    --output_qrels "$outDataDir/$QREL_FILE" \
    "${bitextPathOpt[@]}"
done