Skip to content

Commit

Permalink
transform-gisaid: use lzma to read .xz file
Browse files Browse the repository at this point in the history
Use the `lzma` module to read the compressed `gisaid.ndjson.xz` file so
that we do not have to keep the uncompressed file on disk.
  • Loading branch information
joverlee521 committed Oct 19, 2021
1 parent 10d3e25 commit 062f2f4
Showing 1 changed file with 13 additions and 12 deletions.
25 changes: 13 additions & 12 deletions bin/transform-gisaid
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import os
import argparse
import csv
import sys
import lzma
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent / "lib"))
Expand Down Expand Up @@ -53,7 +54,7 @@ if __name__ == '__main__':
formatter_class=argparse.RawTextHelpFormatter
)
parser.add_argument("gisaid_data",
default="s3://nextstrain-ncov-private/gisaid.ndjson.gz",
default="s3://nextstrain-ncov-private/gisaid.ndjson.xz",
help="Newline-delimited GISAID JSON data")
parser.add_argument("--annotations",
default=str( base / "source-data/gisaid_annotations.tsv" ),
Expand Down Expand Up @@ -81,7 +82,7 @@ if __name__ == '__main__':
"e.g.\n\t"
"Europe/Spain/Catalunya/Mataró\tEurope/Spain/Catalunya/Mataro\n\t")
parser.add_argument("--output-metadata",
default=str( base / "data/gisaid/metadata.tsv" ),
default=str( base / "data/gisaid/metadata.tsv" ),
help="Output location of generated metadata tsv. Defaults to `data/gisaid/metadata.tsv`")
parser.add_argument("--output-fasta",
default=str( base / "data/gisaid/sequences.fasta" ) ,
Expand Down Expand Up @@ -142,7 +143,7 @@ if __name__ == '__main__':
RAW_METADATA_FILENAME = args.output_metadata + '.raw'


with open(args.gisaid_data, "r") as gisaid_fh :
with lzma.open(args.gisaid_data, "r") as gisaid_fh :

pipeline = (
LineToJsonDataSource(gisaid_fh)
Expand All @@ -153,30 +154,30 @@ if __name__ == '__main__':

if not args.sorted_fasta:
pipeline = pipeline | DropSequenceData()

pipeline = (
pipeline
| ExpandLocation()
| FixLabs()
| AbbreviateAuthors()
| ParsePatientAge()
| ParseSex()
| AddHardcodedMetadata()
| AddHardcodedMetadata()
)

# writing the raw metadata in a tsv file
pipeline = ( pipeline | WriteCSV(RAW_METADATA_FILENAME,
METADATA_COLUMNS ,
restval = '?' ,
extrasaction ='ignore' ,
delimiter = '\t',
pipeline = ( pipeline | WriteCSV(RAW_METADATA_FILENAME,
METADATA_COLUMNS ,
restval = '?' ,
extrasaction ='ignore' ,
delimiter = '\t',
dict_writer_kwargs = {'lineterminator': args.newline} ) )


# applying the substitution rules (temporary: writing the intermediate data to verify the effect)
dict_writer_kwargs = {'lineterminator': args.newline}


pipeline = (pipeline
| ApplyUserGeoLocationSubstitutionRules(geoRules)
| MergeUserAnnotatedMetadata(annotations)
Expand Down Expand Up @@ -245,7 +246,7 @@ if __name__ == '__main__':
updated_strain_names_by_line_no[entry[LINE_NUMBER_KEY]] = entry['strain']

if not args.sorted_fasta:
with open(args.gisaid_data, "r") as gisaid_fh:
with lzma.open(args.gisaid_data, "r") as gisaid_fh:
for entry in (
LineToJsonDataSource(gisaid_fh)
| RenameAndAddColumns()
Expand Down

0 comments on commit 062f2f4

Please sign in to comment.