From 934a2b40da0007ce5769d9cf380f080d59f83a3a Mon Sep 17 00:00:00 2001 From: Jover Date: Mon, 18 Oct 2021 16:48:48 -0700 Subject: [PATCH] run-nextclade-full: keep input FASTA compressed Use the `lzma` module to read the compressed input FASTA so that we do not have to take up additional disk space with the uncompressed FASTA. --- bin/run-nextclade-full | 4 ++-- bin/split-fasta | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/bin/run-nextclade-full b/bin/run-nextclade-full index 4acc21bf1..d366a06bd 100755 --- a/bin/run-nextclade-full +++ b/bin/run-nextclade-full @@ -91,7 +91,7 @@ main() { DATE_UTC=$(date -u "+%Y-%m-%d--%H-%M-%S--%Z") S3_DST="$S3_SRC/nextclade-full-run-${DATE_UTC}" - INPUT_FASTA="data/${DATABASE}/sequences.fasta" + INPUT_FASTA="data/${DATABASE}/sequences.fasta.xz" OUTPUT_TSV="data/${DATABASE}/nextclade.tsv" TMP_DIR_FASTA="tmp/${DATABASE}/fasta" TMP_DIR_TSV="tmp/${DATABASE}/clades" @@ -135,7 +135,7 @@ main() { fi echo "[ INFO] ${0}:${LINENO}: Downloading '${S3_SRC}/sequences.fasta.xz' to '${INPUT_FASTA}'" - aws s3 cp --no-progress "${S3_SRC}/sequences.fasta.xz" - | xz -T0 -cdfq >"${INPUT_FASTA}" + aws s3 cp --no-progress "${S3_SRC}/sequences.fasta.xz" "${INPUT_FASTA}" echo "[ INFO] ${0}:${LINENO}: Splitting '${INPUT_FASTA}' into batches of size ${BATCH_SIZE} sequences and storing them in '${INPUT_WILDCARD}'" # Split fasta file to multiple batches diff --git a/bin/split-fasta b/bin/split-fasta index e6d32f292..dd7fcf5e9 100755 --- a/bin/split-fasta +++ b/bin/split-fasta @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import argparse import os +import lzma from Bio import SeqIO @@ -56,7 +57,8 @@ def main(): input_filename = os.path.basename(args.input_file) batch_size = int(args.batch_size) - with open(args.input_file) as f_input: + # Must be in "rt" mode since SeqIO requires FASTA files to be opened in text mode + with lzma.open(args.input_file, "rt") as f_input: record_iter = SeqIO.parse(f_input, file_format) for i, batch in enumerate(batch_iterator(record_iter, batch_size)): filename = os.path.join(args.output_dir, f"{input_filename}.batch-{i:05}.fasta")