From 062f2f48e5668fc78d2c6cb1cfd0d2ecc38e8444 Mon Sep 17 00:00:00 2001 From: Jover Date: Mon, 18 Oct 2021 15:53:37 -0700 Subject: [PATCH] transform-gisaid: use `lzma` to read `.xz` file Use the `lzma` module to read the compressed `gisaid.ndjson.xz` file so that we do not have to keep the uncompressed file on disk. --- bin/transform-gisaid | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/bin/transform-gisaid b/bin/transform-gisaid index 0beb0df38..18ddf6a99 100755 --- a/bin/transform-gisaid +++ b/bin/transform-gisaid @@ -6,6 +6,7 @@ import os import argparse import csv import sys +import lzma from pathlib import Path sys.path.insert(0, str(Path(__file__).parent.parent / "lib")) @@ -53,7 +54,7 @@ if __name__ == '__main__': formatter_class=argparse.RawTextHelpFormatter ) parser.add_argument("gisaid_data", - default="s3://nextstrain-ncov-private/gisaid.ndjson.gz", + default="s3://nextstrain-ncov-private/gisaid.ndjson.xz", help="Newline-delimited GISAID JSON data") parser.add_argument("--annotations", default=str( base / "source-data/gisaid_annotations.tsv" ), @@ -81,7 +82,7 @@ if __name__ == '__main__': "e.g.\n\t" "Europe/Spain/Catalunya/Mataró\tEurope/Spain/Catalunya/Mataro\n\t") parser.add_argument("--output-metadata", - default=str( base / "data/gisaid/metadata.tsv" ), + default=str( base / "data/gisaid/metadata.tsv" ), help="Output location of generated metadata tsv. 
Defaults to `data/gisaid/metadata.tsv`") parser.add_argument("--output-fasta", default=str( base / "data/gisaid/sequences.fasta" ) , @@ -142,7 +143,7 @@ if __name__ == '__main__': RAW_METADATA_FILENAME = args.output_metadata + '.raw' - with open(args.gisaid_data, "r") as gisaid_fh : + with lzma.open(args.gisaid_data, "r") as gisaid_fh : pipeline = ( LineToJsonDataSource(gisaid_fh) @@ -153,7 +154,7 @@ if __name__ == '__main__': if not args.sorted_fasta: pipeline = pipeline | DropSequenceData() - + pipeline = ( pipeline | ExpandLocation() @@ -161,22 +162,22 @@ if __name__ == '__main__': | AbbreviateAuthors() | ParsePatientAge() | ParseSex() - | AddHardcodedMetadata() + | AddHardcodedMetadata() ) # writing the raw metadata in a tsv file - pipeline = ( pipeline | WriteCSV(RAW_METADATA_FILENAME, - METADATA_COLUMNS , - restval = '?' , - extrasaction ='ignore' , - delimiter = '\t', + pipeline = ( pipeline | WriteCSV(RAW_METADATA_FILENAME, + METADATA_COLUMNS , + restval = '?' , + extrasaction ='ignore' , + delimiter = '\t', dict_writer_kwargs = {'lineterminator': args.newline} ) ) # applying the substitution rules (temporary : writing the intermediary data to verify effect ) dict_writer_kwargs = {'lineterminator': args.newline} - + pipeline = (pipeline | ApplyUserGeoLocationSubstitutionRules(geoRules) | MergeUserAnnotatedMetadata(annotations) @@ -245,7 +246,7 @@ if __name__ == '__main__': updated_strain_names_by_line_no[entry[LINE_NUMBER_KEY]] = entry['strain'] if not args.sorted_fasta: - with open(args.gisaid_data, "r") as gisaid_fh: + with lzma.open(args.gisaid_data, "r") as gisaid_fh: for entry in ( LineToJsonDataSource(gisaid_fh) | RenameAndAddColumns()