Skip to content

Commit

Permalink
run-nextclade-full: keep input FASTA compressed
Browse files (browse the repository at this point in the history)
Use Python's `lzma` module in `bin/split-fasta` to read the xz-compressed
input FASTA directly, so that we do not have to take up additional disk
space with an uncompressed copy of the FASTA.
  • Loading branch information
joverlee521 committed Oct 19, 2021
1 parent 01ab999 commit 934a2b4
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 3 deletions.
4 changes: 2 additions & 2 deletions bin/run-nextclade-full
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ main() {
DATE_UTC=$(date -u "+%Y-%m-%d--%H-%M-%S--%Z")
S3_DST="$S3_SRC/nextclade-full-run-${DATE_UTC}"

INPUT_FASTA="data/${DATABASE}/sequences.fasta"
INPUT_FASTA="data/${DATABASE}/sequences.fasta.xz"
OUTPUT_TSV="data/${DATABASE}/nextclade.tsv"
TMP_DIR_FASTA="tmp/${DATABASE}/fasta"
TMP_DIR_TSV="tmp/${DATABASE}/clades"
Expand Down Expand Up @@ -135,7 +135,7 @@ main() {
fi

echo "[ INFO] ${0}:${LINENO}: Downloading '${S3_SRC}/sequences.fasta.xz' to '${INPUT_FASTA}'"
aws s3 cp --no-progress "${S3_SRC}/sequences.fasta.xz" - | xz -T0 -cdfq >"${INPUT_FASTA}"
aws s3 cp --no-progress "${S3_SRC}/sequences.fasta.xz" "${INPUT_FASTA}"

echo "[ INFO] ${0}:${LINENO}: Splitting '${INPUT_FASTA}' into batches of size ${BATCH_SIZE} sequences and storing them in '${INPUT_WILDCARD}'"
# Split fasta file to multiple batches
Expand Down
4 changes: 3 additions & 1 deletion bin/split-fasta
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python3
import argparse
import os
import lzma

from Bio import SeqIO

Expand Down Expand Up @@ -56,7 +57,8 @@ def main():
input_filename = os.path.basename(args.input_file)
batch_size = int(args.batch_size)

with open(args.input_file) as f_input:
# Must be in "rt" mode since SeqIO requires FASTA files to be opened in text mode
with lzma.open(args.input_file, "rt") as f_input:
record_iter = SeqIO.parse(f_input, file_format)
for i, batch in enumerate(batch_iterator(record_iter, batch_size)):
filename = os.path.join(args.output_dir, f"{input_filename}.batch-{i:05}.fasta")
Expand Down

0 comments on commit 934a2b4

Please sign in to comment.