Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add MALT amino acid support #57

Merged
merged 4 commits into from
Jan 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ jobs:
- "singularity"
test_name:
- "test"
- "test_alternatives"
isMaster:
- ${{ github.base_ref == 'master' }}
# Exclude conda and singularity on dev
Expand Down
56 changes: 56 additions & 0 deletions conf/test_alternatives.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.

Use as follows:
nextflow run nf-core/createtaxdb -profile test_alternatives,<docker/singularity> --outdir <OUTDIR>

This config tests options that are mutually exclusive with those used in the default `test` profile (e.g. building MALT with `--sequenceType Protein`).

----------------------------------------------------------------------------------------
*/

process {
    // Upper bounds on the CPUs, memory and wall time a process may request under this profile
    resourceLimits = [cpus: 4, memory: '15.GB', time: '1.h']
}

params {
    // Profile identification; named distinctly so it is not confused with the default `test` profile
    config_profile_name        = 'Test alternatives profile'
    config_profile_description = 'Minimal test dataset to check pipeline function'

    // Input data
    input                      = params.pipelines_testdata_base_path + 'createtaxdb/samplesheets/test.csv'

    dbname                     = "database"

    // Only the MALT database is built in this profile; every other builder is switched off
    build_bracken              = false
    build_diamond              = false
    build_ganon                = false
    build_kaiju                = false
    build_malt                 = true
    build_centrifuge           = false
    build_kraken2              = false
    build_krakenuniq           = false

    // NOTE(review): presumably unused here because build_krakenuniq is false — confirm before removing
    krakenuniq_build_params    = "--work-on-disk --max-db-size 14 --kmer-len 15 --minimizer-len 13 --jellyfish-bin \"\$(which jellyfish)\""
    // The alternative being tested: build MALT from protein rather than nucleotide sequences
    malt_build_params          = "--sequenceType Protein"

    // Taxonomy and accession-mapping files
    accession2taxid            = params.pipelines_testdata_base_path + 'createtaxdb/data/taxonomy/nucl_gb.accession2taxid'
    nucl2taxid                 = params.pipelines_testdata_base_path + 'createtaxdb/data/taxonomy/nucl2tax.map'
    prot2taxid                 = params.pipelines_testdata_base_path + 'createtaxdb/data/taxonomy/prot.accession2taxid.gz'
    nodesdmp                   = params.pipelines_testdata_base_path + 'createtaxdb/data/taxonomy/nodes.dmp'
    namesdmp                   = params.pipelines_testdata_base_path + 'createtaxdb/data/taxonomy/names.dmp'
    // NOTE(review): this is the *nucleotide* MEGAN mapping DB although MALT is built in protein mode above —
    // confirm this is intentional (e.g. no protein mapping DB available in the test data)
    malt_mapdb                 = 's3://ngi-igenomes/test-data/createtaxdb/taxonomy/megan-nucl-Feb2022.db.zip'
}

process {
    // KRAKENUNIQ_BUILD requests 12 GB of memory multiplied by the task attempt number
    withName: 'KRAKENUNIQ_BUILD' {
        memory = { 12.GB * task.attempt }
    }
}
3 changes: 3 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,9 @@ profiles {
test_nothing {
includeConfig 'conf/test_nothing.config'
}
// Test profile loading conf/test_alternatives.config
test_alternatives {
includeConfig 'conf/test_alternatives.config'
}
}

// Load nf-core custom profiles from different Institutions
Expand Down
10 changes: 5 additions & 5 deletions tests/test.nf.test.snap
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
],
"database.dmnd:md5,b2ea49ef5490c526e2c56cae19bcb462",
"database.hibf:md5,af913cecda744b02751e2f5320c35c7c",
"database.tax:md5,e041b05ce29813656f529560dc8a19ae",
"database.tax:md5,85d15469eb6fd11ed2a60890cfdeae82",
"database.fmi:md5,54fd89f5e4eab61af30175e8aa389598",
"hash.k2d:md5,941118164b4bcc010593f7a7c7b30029",
"opts.k2d",
Expand All @@ -31,9 +31,9 @@
"taxonomy.tre:md5,f76fb2d5aa9b0d637234d48175841e0e"
],
"meta": {
"nf-test": "0.9.2",
"nextflow": "24.10.3"
"nf-test": "0.9.0",
"nextflow": "24.10.2"
},
"timestamp": "2024-12-19T11:45:30.380109094"
"timestamp": "2025-01-09T16:42:06.335852855"
}
}
}
38 changes: 38 additions & 0 deletions tests/test_alternatives.nf.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
nextflow_pipeline {

    name "Test pipeline: NFCORE_CREATETAXDB"
    script "main.nf"
    tag "pipeline"
    tag "nfcore_createtaxdb"
    tag "test_alternatives"

    // Runs main.nf end to end and checks the MALT build outputs.
    test("test_alternatives_profile") {

        when {
            params {
                outdir = "$outputDir"
            }
        }

        then {
            // Path prefixes only; no file is touched until the assertion closures run
            def maltDir   = "$outputDir/malt"
            def maltIndex = "$maltDir/malt_index"

            assertAll(
                { assert workflow.success },
                {
                    // The snapshot records whether the last log line mentions 'Peak memory',
                    // followed by the listed index files
                    def finalLogLine = path("$maltDir/malt-build.log").readLines().last()
                    assert snapshot(
                        finalLogLine.contains('Peak memory'),
                        path("$maltIndex/index0.idx"),
                        path("$maltIndex/ref.db"),
                        path("$maltIndex/ref.idx"),
                        path("$maltIndex/ref.inf"),
                        path("$maltIndex/taxonomy.idx"),
                        path("$maltIndex/taxonomy.map"),
                        path("$maltIndex/taxonomy.tre")
                    ).match()
                },
                // Pipeline-level reports: existence check only
                { assert new File("$outputDir/pipeline_info/nf_core_createtaxdb_software_mqc_versions.yml").exists() },
                { assert new File("$outputDir/multiqc/multiqc_report.html").exists() },
                // table0.* files are checked for existence only and are not part of the snapshot
                { assert path("$maltIndex/table0.db").exists() },
                { assert path("$maltIndex/table0.idx").exists() }
            )
        }
    }
}
19 changes: 19 additions & 0 deletions tests/test_alternatives.nf.test.snap
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"test_alternatives_profile": {
"content": [
true,
"index0.idx:md5,ef349ebc456d6446b75570b8a0e57c0d",
"ref.db:md5,9372b49e5990fb4b29ed986a1fc737c4",
"ref.idx:md5,b99c3dc612948a7e4e490c3c74498c80",
"ref.inf:md5,e7ae0e08f730a851abba085e6cc7b4ec",
"taxonomy.idx:md5,f6c520613e1154909658f976498dd4a8",
"taxonomy.map:md5,5bb3f2192e925bca2e61e4b54f1671e0",
"taxonomy.tre:md5,f76fb2d5aa9b0d637234d48175841e0e"
],
"meta": {
"nf-test": "0.9.0",
"nextflow": "24.10.2"
},
"timestamp": "2025-01-09T16:49:09.883808608"
}
}
81 changes: 41 additions & 40 deletions workflows/createtaxdb.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,29 +4,28 @@
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

include { MULTIQC } from '../modules/nf-core/multiqc/main'
include { paramsSummaryMap } from 'plugin/nf-schema'
include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline'
include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline'
include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_createtaxdb_pipeline'
include { MULTIQC } from '../modules/nf-core/multiqc/main'
include { paramsSummaryMap } from 'plugin/nf-schema'
include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline'
include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline'
include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_createtaxdb_pipeline'

// Preprocessing
include { GUNZIP as GUNZIP_DNA } from '../modules/nf-core/gunzip/main'
include { PIGZ_COMPRESS as PIGZ_COMPRESS_DNA } from '../modules/nf-core/pigz/compress/main'
include { PIGZ_COMPRESS as PIGZ_COMPRESS_AA } from '../modules/nf-core/pigz/compress/main'
include { CAT_CAT as CAT_CAT_DNA } from '../modules/nf-core/cat/cat/main'
include { CAT_CAT as CAT_CAT_AA } from '../modules/nf-core/cat/cat/main'
include { GUNZIP as GUNZIP_DNA } from '../modules/nf-core/gunzip/main'
include { GUNZIP as GUNZIP_AA } from '../modules/nf-core/gunzip/main'
include { CAT_CAT as CAT_CAT_DNA } from '../modules/nf-core/cat/cat/main'
include { CAT_CAT as CAT_CAT_AA } from '../modules/nf-core/cat/cat/main'

// Database building (with specific auxiliary modules)
include { CENTRIFUGE_BUILD } from '../modules/nf-core/centrifuge/build/main'
include { DIAMOND_MAKEDB } from '../modules/nf-core/diamond/makedb/main'
include { GANON_BUILDCUSTOM } from '../modules/nf-core/ganon/buildcustom/main'
include { KAIJU_MKFMI } from '../modules/nf-core/kaiju/mkfmi/main'
include { KRAKENUNIQ_BUILD } from '../modules/nf-core/krakenuniq/build/main'
include { UNZIP } from '../modules/nf-core/unzip/main'
include { MALT_BUILD } from '../modules/nf-core/malt/build/main'
include { CENTRIFUGE_BUILD } from '../modules/nf-core/centrifuge/build/main'
include { DIAMOND_MAKEDB } from '../modules/nf-core/diamond/makedb/main'
include { GANON_BUILDCUSTOM } from '../modules/nf-core/ganon/buildcustom/main'
include { KAIJU_MKFMI } from '../modules/nf-core/kaiju/mkfmi/main'
include { KRAKENUNIQ_BUILD } from '../modules/nf-core/krakenuniq/build/main'
include { UNZIP } from '../modules/nf-core/unzip/main'
include { MALT_BUILD } from '../modules/nf-core/malt/build/main'

include { FASTA_BUILD_ADD_KRAKEN2_BRACKEN } from '../subworkflows/nf-core/fasta_build_add_kraken2_bracken/main'
include { FASTA_BUILD_ADD_KRAKEN2_BRACKEN } from '../subworkflows/nf-core/fasta_build_add_kraken2_bracken/main'

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down Expand Up @@ -54,10 +53,13 @@ workflow CREATETAXDB {
DATA PREPARATION
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

// PREPARE: Prepare input for modules that take a single input file
def malt_build_mode = null
if (params.build_malt) {
malt_build_mode = params.malt_build_params.contains('--sequenceType Protein') ? 'protein' : 'nucleotide'
}

if ([params.build_malt, params.build_centrifuge, params.build_kraken2, params.build_bracken, params.build_krakenuniq, params.build_ganon].any()) {
if ([(params.build_malt && malt_build_mode == 'nucleotide'), params.build_centrifuge, params.build_kraken2, params.build_bracken, params.build_krakenuniq, params.build_ganon].any()) {
// Pull just DNA sequences

ch_dna_refs_for_singleref = ch_samplesheet
Expand All @@ -66,14 +68,14 @@ workflow CREATETAXDB {
fasta_dna
}

ch_dna_for_unzipping = ch_dna_refs_for_singleref.branch { meta, fasta ->
ch_dna_for_unzipping = ch_dna_refs_for_singleref.branch { _meta, fasta ->
zipped: fasta.extension == 'gz'
unzipped: true
}

GUNZIP_DNA(ch_dna_for_unzipping.zipped)
ch_prepped_dna_fastas_ungrouped = GUNZIP_DNA.out.gunzip.mix(ch_dna_for_unzipping.unzipped)
ch_prepped_dna_fastas = ch_prepped_dna_fastas_ungrouped.map { meta, fasta -> [[id: params.dbname], fasta] }.groupTuple()
ch_prepped_dna_fastas = ch_prepped_dna_fastas_ungrouped.map { _meta, fasta -> [[id: params.dbname], fasta] }.groupTuple()
ch_versions = ch_versions.mix(GUNZIP_DNA.out.versions.first())

// Place in single file
Expand All @@ -87,22 +89,23 @@ workflow CREATETAXDB {
// docs: https://github.com/bioinformatics-centre/kaiju#custom-database
// docs: https://github.com/nf-core/test-datasets/tree/taxprofiler#kaiju
// idea: try just appending `_<tax_id_from_meta>` to end of each sequence header using a local sed module... it might be sufficient
if ([params.build_kaiju, params.build_diamond].any()) {
if ([(params.build_malt && malt_build_mode == 'protein'), params.build_kaiju, params.build_diamond].any()) {

ch_aa_refs_for_singleref = ch_samplesheet
.map { meta, fasta_dna, fasta_aa -> [[id: params.dbname], fasta_aa] }
.filter { meta, fasta_aa ->
.map { _meta, _fasta_dna, fasta_aa -> [[id: params.dbname], fasta_aa] }
.filter { _meta, fasta_aa ->
fasta_aa
}

ch_aa_for_zipping = ch_aa_refs_for_singleref.branch { meta, fasta ->
ch_aa_for_unzipping = ch_aa_refs_for_singleref.branch { _meta, fasta ->
zipped: fasta.extension == 'gz'
unzipped: true
}

PIGZ_COMPRESS_AA(ch_aa_for_zipping.unzipped)
ch_prepped_aa_fastas = PIGZ_COMPRESS_AA.out.archive.mix(ch_aa_for_zipping.zipped).groupTuple()
//ch_versions = ch_versions.mix( PIGZ_COMPRESS_AA.versions.first() )
GUNZIP_AA(ch_aa_for_unzipping.zipped)
ch_prepped_aa_fastas_ungrouped = GUNZIP_AA.out.gunzip.mix(ch_aa_for_unzipping.unzipped)
ch_prepped_aa_fastas = ch_prepped_aa_fastas_ungrouped.map { _meta, fasta -> [[id: params.dbname], fasta] }.groupTuple()
ch_versions = ch_versions.mix(GUNZIP_AA.out.versions.first())

CAT_CAT_AA(ch_prepped_aa_fastas)
ch_singleref_for_aa = CAT_CAT_AA.out.file_out
Expand Down Expand Up @@ -148,7 +151,7 @@ workflow CREATETAXDB {
.map { it.join("\t") }
.collectFile(
name: "ganon_fasta_input.tsv",
newLine: true
newLine: true,
)
.map {
[[id: params.dbname], it]
Expand Down Expand Up @@ -209,17 +212,17 @@ workflow CREATETAXDB {

// The map DB file is distributed as a zip archive on the MEGAN6 website, so unzip it when needed
if (file(params.malt_mapdb).extension == 'zip') {
ch_malt_mapdb = UNZIP([[], params.malt_mapdb]).unzipped_archive.map { meta, file -> [file] }
ch_malt_mapdb = UNZIP([[], params.malt_mapdb]).unzipped_archive.map { _meta, file -> [file] }
}
else {
ch_malt_mapdb = file(params.malt_mapdb)
}

if (params.malt_build_params.contains('--sequenceType Protein')) {
ch_input_for_malt = ch_prepped_aa_fastas.map { meta, file -> file }
if (malt_build_mode == 'protein') {
ch_input_for_malt = ch_prepped_aa_fastas.map { _meta, file -> file }
}
else {
ch_input_for_malt = ch_prepped_dna_fastas.map { meta, file -> file }
ch_input_for_malt = ch_prepped_dna_fastas.map { _meta, file -> file }
}

MALT_BUILD(ch_input_for_malt, [], ch_malt_mapdb)
Expand All @@ -236,11 +239,9 @@ workflow CREATETAXDB {
softwareVersionsToYAML(ch_versions)
.collectFile(
storeDir: "${params.outdir}/pipeline_info",

name: 'nf_core_' + 'createtaxdb_software_' + 'mqc_' + 'versions.yml',

name: 'nf_core_' + 'createtaxdb_software_' + 'mqc_' + 'versions.yml',
sort: true,
newLine: true
newLine: true,
)
.set { ch_collated_versions }

Expand Down Expand Up @@ -278,7 +279,7 @@ workflow CREATETAXDB {
ch_multiqc_files = ch_multiqc_files.mix(
ch_methods_description.collectFile(
name: 'methods_description_mqc.yaml',
sort: true
sort: true,
)
)

Expand All @@ -288,7 +289,7 @@ workflow CREATETAXDB {
ch_multiqc_custom_config.toList(),
ch_multiqc_logo.toList(),
[],
[]
[],
)

emit:
Expand Down
Loading