-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path02.giamerge
executable file
·61 lines (49 loc) · 1.41 KB
/
02.giamerge
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/bin/bash
#
# Usage: <this script> JOB_LIST L OUTPUT_DIR
#
#   JOB_LIST    range of job partitions written as "FIRST-LAST"; merge_shard
#               replaces the dash with a space and hands the result to seq(1)
#   L           name prefix of the partition directories ("${L}.NNN") and the
#               name of the merged output directory below OUTPUT_DIR
#               (presumably a language code -- confirm with the pipeline docs)
#   OUTPUT_DIR  directory that contains the partition directories; the merged
#               shards are written below it as well
#
# The three values are exported because merge_shard reads them from the
# environment when it is started through `parallel`.
set -euo pipefail

# Fail with a usage line rather than bash's bare "unbound variable" error.
if (( $# < 3 )); then
  printf 'usage: %s JOB_LIST L OUTPUT_DIR\n' "${0##*/}" >&2
  exit 1
fi

# Set the open-file-descriptor limit for this shell and its children to 2048.
ulimit -n 2048

export JOB_LIST=$1
export L=$2
export OUTPUT_DIR=$3
#######################################
# Merge all batches of one shard, gathered from every job partition, into a
# single output shard by running batch_dedupe over them.
#
# Globals (read):
#   JOB_LIST    partition range "FIRST-LAST"; the first dash is replaced by a
#               space and the result is passed to seq(1)
#   L           prefix of the partition directories ("${L}.NNN") and name of
#               the merged output directory
#   OUTPUT_DIR  root directory holding the partitions and the merged output
# Arguments:
#   $1 - shard number
# Outputs:
#   Progress messages on stdout.
# Side effects:
#   Creates "${OUTPUT_DIR}/${L}/<shard>". Non-empty shards are first written
#   to "<shard>~<pid>" next to it and renamed once batch_dedupe has finished.
# Returns:
#   0 on success. With errexit in effect a failing command ends the shell.
#######################################
function merge_shard() {
  # Strict mode is enabled here as well as at the top of the script: the
  # function is exported and launched by `parallel`, presumably in a fresh
  # shell that does not inherit the script's `set` options.
  set -euo pipefail

  local shard=$1
  local part partition shard_dir batch
  # Declared local so that a second call in the same shell starts with an
  # empty list instead of inheriting the previous shard's batches.
  local -a batches=()

  # Find the paths of all batches for this shard across all job partitions.
  # shellcheck disable=SC2086 # "FIRST LAST" must split into two seq arguments
  for part in $(seq ${JOB_LIST/-/ }); do
    printf -v partition '%03d' "${part}"
    shard_dir="${OUTPUT_DIR}/${L}.${partition}/${shard}"

    # Shards may not exist (because they're completely empty).
    [[ -d "${shard_dir}/" ]] || continue

    # Glob instead of parsing `ls`, so paths containing whitespace survive.
    # A pattern without matches stays literal and is dropped by the -d test,
    # which means a shard directory without batch directories contributes
    # nothing rather than making `ls` fail.
    for batch in "${shard_dir}"/*/; do
      [[ -d "${batch}" ]] || continue
      batches+=("${batch}")
    done
  done

  local final_dir="${OUTPUT_DIR}/${L}/${shard}"

  # Completely empty output shard? Make the output folder and be done.
  if (( ${#batches[@]} == 0 )); then
    printf '%d is an empty shard\n' "${shard}"
    mkdir -p -- "${final_dir}"
    return
  fi

  printf '%d batches:\n' "${#batches[@]}"
  printf '%s\n' "${batches[@]}"

  # Write into a PID-suffixed staging directory so that the final path only
  # appears once batch_dedupe has completed.
  local staging_dir="${final_dir}~$$"
  mkdir -p -- "${staging_dir}"

  # The option values below are passed through unchanged; their meaning is
  # defined by batch_dedupe, which is not part of this file.
  batch_dedupe \
    --verbose \
    --bytes $(( 3 * 1024 * 1024 * 1024 )) \
    --limit 4096 \
    --combined source.gz url.gz \
    --derived plain_text.gz \
    --unique plain_text.gz \
    --output "${staging_dir}" \
    "${batches[@]}"

  # Move to permanent position.
  # NOTE(review): if final_dir already exists (e.g. on a re-run), mv places
  # the staging directory inside it instead of replacing it -- confirm
  # whether re-runs are expected to clean up first.
  mv -- "${staging_dir}" "${final_dir}"
}
# Export the function so the bash processes started by parallel can call it.
export -f merge_shard

# Number of consecutive shards handled by one array task (default 16).
SHARDS_PER_TASK=${SHARDS_PER_TASK:-16}

# Both variables are expected to come from SLURM; fail with a readable
# message instead of an "unbound variable" error when they are missing.
: "${SLURM_ARRAY_TASK_ID:?must be set (expected to run as a SLURM array task)}"
: "${SLURM_TASKS_PER_NODE:?must be set}"

# The range arithmetic below is 1-based: task N covers shards
# (N-1)*SHARDS_PER_TASK .. N*SHARDS_PER_TASK-1. An index of 0 would produce
# negative shard numbers, so reject it up front.
if (( SLURM_ARRAY_TASK_ID < 1 )); then
  printf 'SLURM_ARRAY_TASK_ID must be >= 1, got %s\n' "${SLURM_ARRAY_TASK_ID}" >&2
  exit 1
fi

SHARD_END=$((SLURM_ARRAY_TASK_ID * SHARDS_PER_TASK))
SHARD_START=$((SHARD_END - SHARDS_PER_TASK))

# Run merge_shard once per shard number, SLURM_TASKS_PER_NODE jobs at a time.
# --line-buffer interleaves output per line; --tagstring prefixes every
# output line with the shard number in brackets.
# shellcheck disable=SC2046 # seq output must split into one argument per shard
parallel \
  -j"${SLURM_TASKS_PER_NODE}" \
  --line-buffer \
  --tagstring '[{}]' \
  merge_shard '{}' ::: $(seq "${SHARD_START}" "$((SHARD_END - 1))")