# Patch summary (this text precedes the first "diff --git" header and belongs to no hunk):
# - Adds the executable script bin/generate-nextclade-version-json. It requires three
#   positional arguments (Nextclade executable, Nextclade dataset, Nextclade TSV) and
#   prints one compact JSON object holding schema_version "v1", the output of
#   `<nextclade> --version`, `.attributes.name` and `.version.tag` read from the
#   pathogen.json that `unzip -p` extracts from the dataset, and the output of the
#   vendored sha256sum helper fed the TSV on stdin.
# - Adds rule nextclade_version_json to nextclade.smk, which redirects that script's
#   stdout to data/{database}/nextclade{reference}_version.json.
# - Adds rule metadata_version_json to nextclade.smk, which uses jq to set
#   .metadata_tsv_sha256sum on the Nextclade version JSON and writes the result to
#   data/{database}/metadata_version.json.
# - Adds the three *_version.json files to the files_to_upload mapping in upload.smk.
# NOTE(review): the hunk bodies below look whitespace-collapsed (many diff lines per
# physical line) -- confirm against the original commit before trying to apply this.
diff --git a/bin/generate-nextclade-version-json b/bin/generate-nextclade-version-json new file mode 100755 index 00000000..899aee83 --- /dev/null +++ b/bin/generate-nextclade-version-json @@ -0,0 +1,30 @@ +#!/bin/bash + +set -euo pipefail + +vendored="$(dirname "$0")"/../vendored + + +nextclade="${1:?A path to the Nextclade executable is required as the first argument}" +nextclade_dataset="${2:?A path to the Nextclade dataset is required as the second argument}" +nextclade_tsv="${3:?A path to the Nextclade TSV is required as the third argument}" + + +nextclade_version="$("$nextclade" --version)" +dataset_pathogen_json="$(unzip -p "$nextclade_dataset" pathogen.json)" +dataset_name="$(echo "$dataset_pathogen_json" | jq -r '.attributes.name')" +dataset_version="$(echo "$dataset_pathogen_json" | jq -r '.version.tag')" +nextclade_tsv_sha256sum="$("$vendored/sha256sum" < "$nextclade_tsv")" + +jq -c --null-input \ + --arg NEXTCLADE_VERSION "$nextclade_version" \ + --arg DATASET_NAME "$dataset_name" \ + --arg DATASET_VERSION "$dataset_version" \ + --arg NEXTCLADE_TSV_SHA256SUM "$nextclade_tsv_sha256sum" \ + '{ + "schema_version": "v1", + "nextclade_version": $NEXTCLADE_VERSION, + "nextclade_dataset_name": $DATASET_NAME, + "nextclade_dataset_version": $DATASET_VERSION, + "nextclade_tsv_sha256sum": $NEXTCLADE_TSV_SHA256SUM + }' diff --git a/workflow/snakemake_rules/nextclade.smk b/workflow/snakemake_rules/nextclade.smk index ed3d5c46..1fe637d8 100644 --- a/workflow/snakemake_rules/nextclade.smk +++ b/workflow/snakemake_rules/nextclade.smk @@ -316,6 +316,26 @@ rule nextclade_info: """ +rule nextclade_version_json: + """ + Generates a version JSON for the Nextclade TSV. 
+ """ + input: + nextclade_path="data/nextclade", + nextclade_dataset=lambda w: f"data/nextclade_data/sars-cov-2{w.reference.replace('_','-')}.zip", + nextclade_tsv=f"data/{database}/nextclade{{reference}}.tsv", + output: + nextclade_version_json=f"data/{database}/nextclade{{reference}}_version.json", + shell: + """ + ./bin/generate-nextclade-version-json \ + {input.nextclade_path} \ + {input.nextclade_dataset} \ + {input.nextclade_tsv} \ + > {output.nextclade_version_json} + """ + + rule combine_alignments: """ Generating full alignment by combining newly aligned sequences with previous (cached) alignment @@ -365,3 +385,27 @@ rule generate_metadata: --clade-legacy-mapping {input.clade_legacy_mapping} \ -o {output.metadata} """ + + +rule metadata_version_json: + """ + Generates the metadata version JSON by adding the metadata TSV sha256sum + to the Nextclade version JSON. + + TODO: Merge the 21L Nextclade version JSON to track data provenance for + specific columns + """ + input: + metadata=f"data/{database}/metadata.tsv", + nextclade_version_json=f"data/{database}/nextclade_version.json", + output: + metadata_version_json=f"data/{database}/metadata_version.json", + shell: + """ + metadata_tsv_sha256sum="$(./vendored/sha256sum < {input.metadata})" + + cat {input.nextclade_version_json} \ + | jq -c --arg METADATA_TSV_SHA256SUM "$metadata_tsv_sha256sum" \ + '.metadata_tsv_sha256sum = $METADATA_TSV_SHA256SUM' \ + > {output.metadata_version_json} + """ diff --git a/workflow/snakemake_rules/upload.smk b/workflow/snakemake_rules/upload.smk index 2b8a02bb..b6a08985 100644 --- a/workflow/snakemake_rules/upload.smk +++ b/workflow/snakemake_rules/upload.smk @@ -33,6 +33,9 @@ def compute_files_to_upload(): "aligned.fasta.zst": f"data/{database}/aligned.fasta", "nextclade_21L.tsv.zst": f"data/{database}/nextclade_21L.tsv", + "nextclade_version.json": f"data/{database}/nextclade_version.json", + "nextclade_21L_version.json": f"data/{database}/nextclade_21L_version.json", + 
"metadata_version.json": f"data/{database}/metadata_version.json", } files_to_upload = files_to_upload | { f"translation_{gene}.fasta.zst" : f"data/{database}/translation_{gene}.fasta"