-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathaws_backup.sh
executable file
·365 lines (271 loc) · 12.2 KB
/
aws_backup.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
# This script creates two tarballs for each plate of data.
# The first tarball (with suffix _images_illum_analysis) contains
# - images (typically ~230Gb)
# - illumination functions (tiny)
# - CellProfiler measurements as CSV files (typically ~30Gb)
#
# The second tarball (with suffix _backend) contains
# - SQLite backend created by ingesting the CSV files (typically ~24Gb)
# - CSV and GCT files created by processing the SQLite backend (tiny)
#
# The tarballs (and their corresponding md5 files) are stored at this location on the "cold" bucket (e.g. imaging-platform-cold)
#
# s3://imaging-platform-cold/imaging_analysis/<PROJECT_NAME>/plates/<PROJECT_NAME>_<BATCH_ID>_<PLATE_ID>_images_illum_analysis.tar.gz
# s3://imaging-platform-cold/imaging_analysis/<PROJECT_NAME>/plates/<PROJECT_NAME>_<BATCH_ID>_<PLATE_ID>_backend.tar.gz
# s3://imaging-platform-cold/imaging_analysis/<PROJECT_NAME>/plates/<PROJECT_NAME>_<BATCH_ID>_<PLATE_ID>_images_illum_analysis.md5
# s3://imaging-platform-cold/imaging_analysis/<PROJECT_NAME>/plates/<PROJECT_NAME>_<BATCH_ID>_<PLATE_ID>_backend.md5
#
# The file listing of the contents of both tarballs (as they existed on S3) is stored at this location on the "cold" bucket
#
# s3://imaging-platform-cold/imaging_analysis/<PROJECT_NAME>/plates/<PROJECT_NAME>_<BATCH_ID>_<PLATE_ID>_file_listing_untrimmed_s3.txt
#
# e.g.
# .
# └── imaging-platform-cold
# └── imaging_analysis
# └── 2013_Gustafsdottir_PLOSONE
# └── plates
# ├── 2013_Gustafsdottir_PLOSONE_BBBC022_20586_file_listing_untrimmed_s3.txt
# ├── 2013_Gustafsdottir_PLOSONE_BBBC022_20586_backend.md5
# ├── 2013_Gustafsdottir_PLOSONE_BBBC022_20586_backend.tar.gz
# ├── 2013_Gustafsdottir_PLOSONE_BBBC022_20586_images_illum_analysis.md5
# ├── 2013_Gustafsdottir_PLOSONE_BBBC022_20586_images_illum_analysis.tar.gz
# ├── ...
# ├── 2013_Gustafsdottir_PLOSONE_BBBC022_20589_file_listing_untrimmed_s3.txt
# ├── 2013_Gustafsdottir_PLOSONE_BBBC022_20589_backend.md5
# ├── 2013_Gustafsdottir_PLOSONE_BBBC022_20589_backend.tar.gz
# ├── 2013_Gustafsdottir_PLOSONE_BBBC022_20589_images_illum_analysis.md5
# └── 2013_Gustafsdottir_PLOSONE_BBBC022_20589_images_illum_analysis.tar.gz
#
# Additionally, the following 3 files are stored in the "live" bucket (e.g. imaging-platform)
#
# s3://imaging-platform/projects/<PROJECT_NAME>/workspace/backup/<PROJECT_NAME>_<BATCH_ID>_<PLATE_ID>_file_listing_s3.txt
# s3://imaging-platform/projects/<PROJECT_NAME>/workspace/backup/<PROJECT_NAME>_<BATCH_ID>_<PLATE_ID>_file_listing_tar.txt
# s3://imaging-platform/projects/<PROJECT_NAME>/workspace/backup/<PROJECT_NAME>_<BATCH_ID>_<PLATE_ID>_delete_s3.sh
#
# The first two are file listings of the archives as they exist on S3 and in the tarball respectively. They have been
# formatted so that they can be compared via diff or by their ETag.
#
# The third file has a list of awscli commands to delete the files that have been archived by this process.
#
# When 2013_Gustafsdottir_PLOSONE_BBBC022_20586_*.tar.gz files are unzipped like this,
#
# tar xzf 2013_Gustafsdottir_PLOSONE_BBBC022_20586_images_illum_analysis.tar.gz --strip-components=1
# tar xzf 2013_Gustafsdottir_PLOSONE_BBBC022_20586_backend.tar.gz --strip-components=1
#
# the directory structure will look like this
# .
# └── 2013_Gustafsdottir_PLOSONE
# ├── BBBC022
# │ ├── illum
# │ │ └── 20586
# │ └── images
# │ └── 20586
# └── workspace
# ├── analysis
# │ └── BBBC022
# │ └── 20586
# └── backend
# └── BBBC022
# └── 20586
# Example usage:
#
# ./aws_backup.sh \
# --project_name 2013_Gustafsdottir_PLOSONE \
# --batch_id BBBC022 \
# --plate_id_full "20586" \
# --plate_id 20586 \
# --tmpdir ~/ebs_tmp
#
# Note: In the example above, `plate_id` and `plate_id_full` are the same but this is not always true.
# E.g. The `plate_id` for "BR00092655__2017-12-10T12_48_16-Measurement 1" is "BR00092655"
# Name of this script, used to prefix diagnostics.
progname=$(basename "$0")

#######################################
# Parse command-line options into global configuration variables.
# Globals written (only when the matching option is given):
#   project_name, batch_id, plate_id_full, plate_id,
#   bucket, cold_bucket, tmpdir
# Arguments:
#   The script's command line, as "--option value" pairs.
# Outputs:
#   A diagnostic on stderr for an unknown option or a missing value.
# Exits:
#   1 on an unknown option or an option without a value. Continuing with a
#   mistyped flag would leave an identifier empty, and every S3 path later
#   in the script is built from those identifiers.
#######################################
function parse_args {
  local key
  while [[ $# -gt 0 ]]; do
    key="$1"

    # First pass: reject anything that is not a known option/value pair.
    case "$key" in
      --project_name | --batch_id | --plate_id_full | --plate_id | \
        --bucket | --cold_bucket | -t | --tmpdir)
        if [[ $# -lt 2 ]]; then
          echo "${progname}: option ${key} requires a value" >&2
          exit 1
        fi
        ;;
      *)
        echo "${progname}: unknown option: ${key}" >&2
        exit 1
        ;;
    esac

    # Second pass: store the value in the matching global.
    case "$key" in
      --project_name) project_name="$2" ;;
      --batch_id) batch_id="$2" ;;
      --plate_id_full) plate_id_full="$2" ;;
      --plate_id) plate_id="$2" ;;
      --bucket) bucket="$2" ;;
      --cold_bucket) cold_bucket="$2" ;;
      -t | --tmpdir) tmpdir="$2" ;;
    esac

    # Consume the option and its value.
    shift 2
  done
}

parse_args "$@"
# Example values (handy when running the steps below by hand):
# project_name=2013_Gustafsdottir_PLOSONE
# batch_id=BBBC022
# plate_id_full="20586"
# plate_id=20586
# tmpdir=~/ebs_tmp

# Every path below is assembled from these four identifiers. Abort early if
# any is missing rather than operate on prefixes such as
# "s3://<bucket>/projects//" (the same prefixes are later written into the
# generated S3 delete script).
: "${project_name:?--project_name is required}"
: "${batch_id:?--batch_id is required}"
: "${plate_id_full:?--plate_id_full is required}"
: "${plate_id:?--plate_id is required}"

# Optional settings fall back to their defaults when unset or empty.
bucket="${bucket:-imaging-platform}"
cold_bucket="${cold_bucket:-imaging-platform-cold}"
tmpdir="${tmpdir:-/tmp}"

# S3 locations: live project data, live backup metadata, and cold archive.
s3_prefix="s3://${bucket}/projects/${project_name}"
s3_backup_prefix="s3://${bucket}/projects/${project_name}/workspace/backup"
s3_cold_prefix="s3://${cold_bucket}/imaging_analysis/${project_name}/plates"
s3_cold_prefix_key="imaging_analysis/${project_name}/plates"

# Base name shared by every artifact produced for this plate.
plate_archive_tag="${project_name}_${batch_id}_${plate_id}"

# Directory the script was launched from; the helper programs
# (get_multipart_chunksize, s3md5) are looked up here later on.
script_dir=$(pwd)

# report sizes
# s3cmd du "${s3_prefix}/${batch_id}/images/${plate_id_full}"
# s3cmd du "${s3_prefix}/${batch_id}/illum/${plate_id}"
# s3cmd du "${s3_prefix}/workspace/analysis/${batch_id}/${plate_id}"
# s3cmd du "${s3_prefix}/workspace/backend/${batch_id}/${plate_id}"

#https://stackoverflow.com/questions/19622198/what-does-set-e-mean-in-a-bash-script#comment36826142_19622569
# Exit immediately if a command exits with a non-zero status
trap 'exit' ERR

# create staging directory
cd "${tmpdir}"
mkdir -p "${plate_archive_tag}"
cd "${plate_archive_tag}"

# create subdirectories
mkdir -p "${project_name}/${batch_id}/images/${plate_id_full}"
mkdir -p "${project_name}/${batch_id}/illum/${plate_id}"
mkdir -p "${project_name}/workspace/analysis/${batch_id}/${plate_id}"
mkdir -p "${project_name}/workspace/backend/${batch_id}/${plate_id}"

cd "${project_name}"

# get file listing (both listing files live in ${tmpdir}, two levels up)
file_listing_s3="../../${plate_archive_tag}_file_listing_s3.txt"
file_listing_untrimmed_s3="../../${plate_archive_tag}_file_listing_untrimmed_s3.txt"

# start from an empty untrimmed listing
rm -rf -- "${file_listing_untrimmed_s3}"
touch "${file_listing_untrimmed_s3}"

# aws s3 ls returns 1 if file / prefix doesn't exist, so ignore errors here
trap '' ERR

aws s3 ls --recursive "${s3_prefix}/${batch_id}/images/${plate_id_full}" >> "${file_listing_untrimmed_s3}"
aws s3 ls --recursive "${s3_prefix}/${batch_id}/illum/${plate_id}" >> "${file_listing_untrimmed_s3}"
aws s3 ls --recursive "${s3_prefix}/workspace/analysis/${batch_id}/${plate_id}" >> "${file_listing_untrimmed_s3}"
aws s3 ls --recursive "${s3_prefix}/workspace/backend/${batch_id}/${plate_id}" >> "${file_listing_untrimmed_s3}"

# reset trap to exit
trap 'exit' ERR

# make list of files on S3: drop entries whose path ends in "/", keep the
# size and key columns, emit them as "<key>\t<size>" with "projects/"
# removed, and sort.
# NOTE(review): fields are split on single spaces, so a key containing a
# space is cut at its first space. The tarball listing is built the same way;
# change both together if this is ever fixed.
awk -F/ '{ if($NF != "") print }' "${file_listing_untrimmed_s3}" \
  | tr -s " " \
  | cut -d" " -f3,4 \
  | awk '{ print $2 "\t" $1}' \
  | sed s,projects/,,g \
  | sort > "${file_listing_s3}"

# download data from S3
aws s3 sync "${s3_prefix}/${batch_id}/images/${plate_id_full}" "${batch_id}/images/${plate_id_full}"
aws s3 sync "${s3_prefix}/${batch_id}/illum/${plate_id}" "${batch_id}/illum/${plate_id}"
aws s3 sync "${s3_prefix}/workspace/analysis/${batch_id}/${plate_id}" "workspace/analysis/${batch_id}/${plate_id}"
aws s3 sync "${s3_prefix}/workspace/backend/${batch_id}/${plate_id}" "workspace/backend/${batch_id}/${plate_id}"

# TODO Do checks to ensure transfer happened after each of the steps above

# back to ${tmpdir}, where the tarballs are created
cd ../../
#######################################
# Abort the script if an object already exists at the given S3 location.
# The bucket and key are split out of the URL with shell parameter expansion,
# so no external interpreter is needed.
# Arguments:
#   $1 - object location of the form s3://<bucket>/<key>
# Outputs:
#   Whatever `aws s3api head-object` prints on stdout, plus a message on
#   stdout when the object exists.
# Side effects:
#   Leaves the ERR trap set to 'exit' when it finishes.
# Exits:
#   1 if head-object succeeds, i.e. the object is already there.
#######################################
function verify_object_does_not_exist {
  local s3_location=$1

  # Strip the scheme, then split at the first "/" into bucket and key.
  local bucket_and_key=${s3_location#*://}
  local xbucket=${bucket_and_key%%/*}
  local xkey=""
  if [[ "${bucket_and_key}" == */* ]]; then
    xkey=${bucket_and_key#*/}
  fi
  # Drop any further leading slashes from the key.
  while [[ "${xkey}" == /* ]]; do
    xkey=${xkey#/}
  done

  # head-object fails when the object is absent, which is the expected case,
  # so ignore ERR while probing.
  local exit_code=0
  trap '' ERR
  aws s3api head-object --bucket "${xbucket}" --key "${xkey}" 2> /dev/null
  exit_code=$?
  trap 'exit' ERR

  if [ "$exit_code" == "0" ]; then
    echo "Object already exists: " "${s3_location}"
    exit 1
  fi
}
#######################################
# Publish an already-created tarball to the cold bucket and verify the upload.
# Steps: refuse to overwrite an existing archive, write a listing of the
# tarball's contents, compute its md5, upload it, compare the locally computed
# ETag with the one S3 reports, upload the .md5 and .etag files, and remove
# the local tarball, .md5 and .etag.
# Globals read:
#   s3_cold_prefix, s3_cold_prefix_key, cold_bucket, plate_archive_tag,
#   script_dir
# Arguments:
#   $1 - tarball base name; "<$1>.tar.gz" must exist in the current directory
# Outputs:
#   Creates "<$1>_file_listing_tar.txt" in the current directory; prints a
#   message on stdout if the ETags differ.
# Exits:
#   1 if the archive already exists remotely or the ETags do not match.
#######################################
function process_tar_file {
  # Take the name from the argument first, so everything below (including the
  # archive location) is derived from $1 and not from a caller's global.
  local tar_file=$1
  local archive_location="${s3_cold_prefix}/${tar_file}.tar.gz"

  # check whether file already exists, in which case, error out
  verify_object_does_not_exist "${archive_location}"

  # List the tarball's regular entries as "<path>\t<size>", with the leading
  # "${plate_archive_tag}/" removed, sorted.
  # NOTE(review): fields are split on single spaces, so a path containing a
  # space is cut at its first space.
  local file_listing_tar="${tar_file}_file_listing_tar.txt"
  tar -tvzf "${tar_file}.tar.gz" \
    | awk -F/ '{ if($NF != "") print }' \
    | tr -s " " \
    | cut -f3,6 -d" " \
    | awk '{ print $2 "\t" $1 }' \
    | sed "s,${plate_archive_tag}/,,g" \
    | sort > "${file_listing_tar}"

  # calculate md5
  md5sum "${tar_file}.tar.gz" > "${tar_file}.md5"

  # copy to S3
  aws s3 cp "${tar_file}.tar.gz" "${archive_location}"

  # check whether local ETag and remote Etag match
  # https://github.com/antespi/s3md5
  local size
  size=$(du -b "${tar_file}.tar.gz" | cut -f1)

  local etag_local
  # if size is less than or equal to 8Mb, then ETag is same as MD5
  if [[ "${size}" -le 8388608 ]]; then
    etag_local=$(cut -d" " -f1 "${tar_file}.md5")
  else
    # Larger files get a multipart-style ETag, computed by the helper
    # programs found in ${script_dir}.
    local multipart_chunksize
    multipart_chunksize=$("${script_dir}/get_multipart_chunksize" "${tar_file}.tar.gz")
    etag_local=$("${script_dir}/s3md5" "${multipart_chunksize}" "${tar_file}.tar.gz")
  fi

  # Strip the quotes and backslashes from the ETag in head-object's JSON.
  local etag_remote
  etag_remote=$(aws s3api head-object --bucket "${cold_bucket}" --key "${s3_cold_prefix_key}/${tar_file}.tar.gz" \
    | jq '.ETag' - | tr -d '"' | tr -d '\\')

  if [ "$etag_local" != "$etag_remote" ]; then
    echo "Remote and local ETags don't match"
    echo "Remote =" "$etag_remote"
    echo "Local =" "$etag_local"
    exit 1
  fi

  echo "${etag_remote}" > "${tar_file}.etag"

  # copy md5 and etag to remote
  aws s3 cp "${tar_file}.md5" "${s3_cold_prefix}/${tar_file}.md5"
  aws s3 cp "${tar_file}.etag" "${s3_cold_prefix}/${tar_file}.etag"

  # remove local cache of tarball and md5 and etag
  rm -rf -- "${tar_file}.tar.gz" "${tar_file}.md5" "${tar_file}.etag"
}
# create tarball for images, illum, analysis folders, then upload and verify it
tar_file="${plate_archive_tag}_images_illum_analysis"
tar -czf "${tar_file}.tar.gz" \
  "${plate_archive_tag}/${project_name}/${batch_id}/images/${plate_id_full}" \
  "${plate_archive_tag}/${project_name}/${batch_id}/illum/${plate_id}" \
  "${plate_archive_tag}/${project_name}/workspace/analysis/${batch_id}/${plate_id}"
process_tar_file "${tar_file}"
file_listing_tar_1="${tar_file}_file_listing_tar.txt"

# create tarball for backend folders, then upload and verify it
tar_file="${plate_archive_tag}_backend"
tar -czf "${tar_file}.tar.gz" \
  "${plate_archive_tag}/${project_name}/workspace/backend/${batch_id}/${plate_id}"
process_tar_file "${tar_file}"
file_listing_tar_2="${tar_file}_file_listing_tar.txt"

# create combined file listings; sort reads both files directly so that a
# missing listing is reported as a failure instead of being silently skipped
combined_listing_tar="${plate_archive_tag}_file_listing_tar.txt"
sort "${file_listing_tar_1}" "${file_listing_tar_2}" > "${combined_listing_tar}"
rm -- "${file_listing_tar_1}" "${file_listing_tar_2}"

# upload each listing, removing the local copy after its upload
aws s3 cp "${combined_listing_tar}" "${s3_backup_prefix}/${combined_listing_tar}"
rm -- "${combined_listing_tar}"

aws s3 cp "${plate_archive_tag}_file_listing_s3.txt" "${s3_backup_prefix}/${plate_archive_tag}_file_listing_s3.txt"
rm -- "${plate_archive_tag}_file_listing_s3.txt"

aws s3 cp "${plate_archive_tag}_file_listing_untrimmed_s3.txt" "${s3_cold_prefix}/${plate_archive_tag}_file_listing_untrimmed_s3.txt"
rm -- "${plate_archive_tag}_file_listing_untrimmed_s3.txt"

# remove downloaded files; ":?" aborts instead of running rm -rf on an empty path
rm -rf -- "${plate_archive_tag:?}"

# create script to delete files from S3 (but don't actually delete them)
delete_s3="${plate_archive_tag}_delete_s3.sh"
{
  echo aws s3 rm --recursive "\"${s3_prefix}/${batch_id}/images/${plate_id_full}\""
  echo aws s3 rm --recursive "\"${s3_prefix}/${batch_id}/illum/${plate_id}\""
  echo aws s3 rm --recursive "\"${s3_prefix}/workspace/analysis/${batch_id}/${plate_id}\""
  echo aws s3 rm --recursive "\"${s3_prefix}/workspace/backend/${batch_id}/${plate_id}\""
} > "${delete_s3}"

aws s3 cp "${delete_s3}" "${s3_backup_prefix}/${delete_s3}"
rm -- "${delete_s3}"