Skip to content

Commit

Permalink
Merge pull request #57 from nf-core/malt-faa-support
Browse files Browse the repository at this point in the history
Add MALT amino acid support
  • Loading branch information
alxndrdiaz authored Jan 13, 2025
2 parents 5b5fcda + dc76850 commit b932606
Show file tree
Hide file tree
Showing 7 changed files with 163 additions and 45 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ jobs:
- "singularity"
test_name:
- "test"
- "test_alternatives"
isMaster:
- ${{ github.base_ref == 'master' }}
# Exclude conda and singularity on dev
Expand Down
56 changes: 56 additions & 0 deletions conf/test_alternatives.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.
Use as follows:
nextflow run nf-core/createtaxdb -profile test_alternatives,<docker/singularity> --outdir <OUTDIR>
This config tests options that are mutually exclusive with those used in the default `test` profile
----------------------------------------------------------------------------------------
*/

process {
    // Resource limits applied to every process in this test profile:
    // 4 CPUs, 15 GB of memory and 1 hour of wall time.
    resourceLimits = [cpus: 4, memory: '15.GB', time: '1.h']
}

params {
    // Profile identity. Named after this profile (rather than the generic
    // 'Test profile') so runs can be told apart from the default `test` profile.
    config_profile_name        = 'Test alternatives profile'
    config_profile_description = 'Minimal test dataset to check pipeline function with options mutually exclusive to the default test profile'

    // Input data
    input = params.pipelines_testdata_base_path + 'createtaxdb/samplesheets/test.csv'

    dbname = "database"

    // Only the MALT database is built in this profile; every other builder is disabled.
    build_bracken    = false
    build_diamond    = false
    build_ganon      = false
    build_kaiju      = false
    build_malt       = true
    build_centrifuge = false
    build_kraken2    = false
    build_krakenuniq = false

    // NOTE(review): build_krakenuniq is false above, so this value looks unused in
    // this profile -- confirm whether it can be dropped.
    krakenuniq_build_params = "--work-on-disk --max-db-size 14 --kmer-len 15 --minimizer-len 13 --jellyfish-bin \"\$(which jellyfish)\""
    // Requests a protein (amino acid) MALT build instead of the nucleotide default.
    malt_build_params       = "--sequenceType Protein"

    // Taxonomy and accession-mapping inputs
    accession2taxid = params.pipelines_testdata_base_path + 'createtaxdb/data/taxonomy/nucl_gb.accession2taxid'
    nucl2taxid      = params.pipelines_testdata_base_path + 'createtaxdb/data/taxonomy/nucl2tax.map'
    prot2taxid      = params.pipelines_testdata_base_path + 'createtaxdb/data/taxonomy/prot.accession2taxid.gz'
    nodesdmp        = params.pipelines_testdata_base_path + 'createtaxdb/data/taxonomy/nodes.dmp'
    namesdmp        = params.pipelines_testdata_base_path + 'createtaxdb/data/taxonomy/names.dmp'
    // NOTE(review): this is the `megan-nucl` mapping DB although the build above is
    // set to protein mode -- confirm this pairing is intended for the test.
    malt_mapdb      = 's3://ngi-igenomes/test-data/createtaxdb/taxonomy/megan-nucl-Feb2022.db.zip'
}

process {
    // KRAKENUNIQ_BUILD asks for 12 GB of memory multiplied by the attempt number.
    withName: 'KRAKENUNIQ_BUILD' {
        memory = { 12.GB * task.attempt }
    }
}
3 changes: 3 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,9 @@ profiles {
test_nothing {
includeConfig 'conf/test_nothing.config'
}
// Test profile that loads the settings in conf/test_alternatives.config
test_alternatives {
includeConfig 'conf/test_alternatives.config'
}
}

// Load nf-core custom profiles from different Institutions
Expand Down
10 changes: 5 additions & 5 deletions tests/test.nf.test.snap
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
],
"database.dmnd:md5,b2ea49ef5490c526e2c56cae19bcb462",
"database.hibf:md5,af913cecda744b02751e2f5320c35c7c",
"database.tax:md5,e041b05ce29813656f529560dc8a19ae",
"database.tax:md5,85d15469eb6fd11ed2a60890cfdeae82",
"database.fmi:md5,54fd89f5e4eab61af30175e8aa389598",
"hash.k2d:md5,941118164b4bcc010593f7a7c7b30029",
"opts.k2d",
Expand All @@ -31,9 +31,9 @@
"taxonomy.tre:md5,f76fb2d5aa9b0d637234d48175841e0e"
],
"meta": {
"nf-test": "0.9.2",
"nextflow": "24.10.3"
"nf-test": "0.9.0",
"nextflow": "24.10.2"
},
"timestamp": "2024-12-19T11:45:30.380109094"
"timestamp": "2025-01-09T16:42:06.335852855"
}
}
}
38 changes: 38 additions & 0 deletions tests/test_alternatives.nf.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
nextflow_pipeline {

    name "Test pipeline: NFCORE_CREATETAXDB"
    script "main.nf"
    tag "pipeline"
    tag "nfcore_createtaxdb"
    tag "test_alternatives"

    // NOTE(review): no `profile` directive is declared in this file, so the profile
    // presumably comes from the nf-test configuration or the command line --
    // confirm that `test_alternatives` is really the profile being exercised.
    test("test_alternatives_profile") {

        when {
            params {
                outdir = "$outputDir"
            }
        }

        then {
            // Shared path prefixes for the MALT outputs checked below.
            def maltDir   = "$outputDir/malt"
            def maltIndex = "${maltDir}/malt_index"

            assertAll(
                { assert workflow.success },
                // Snapshot: whether the build log's last line mentions 'Peak memory',
                // followed by the MALT index files (order matters for the snapshot).
                { assert snapshot(
                    path("${maltDir}/malt-build.log").readLines().last().contains('Peak memory'),
                    path("${maltIndex}/index0.idx"),
                    path("${maltIndex}/ref.db"),
                    path("${maltIndex}/ref.idx"),
                    path("${maltIndex}/ref.inf"),
                    path("${maltIndex}/taxonomy.idx"),
                    path("${maltIndex}/taxonomy.map"),
                    path("${maltIndex}/taxonomy.tre")
                ).match() },
                // Existence-only checks: these files are kept out of the snapshot.
                { assert path("$outputDir/pipeline_info/nf_core_createtaxdb_software_mqc_versions.yml").exists() },
                { assert path("$outputDir/multiqc/multiqc_report.html").exists() },
                { assert path("${maltIndex}/table0.db").exists() },
                { assert path("${maltIndex}/table0.idx").exists() }
            )
        }
    }
}
19 changes: 19 additions & 0 deletions tests/test_alternatives.nf.test.snap
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"test_alternatives_profile": {
"content": [
true,
"index0.idx:md5,ef349ebc456d6446b75570b8a0e57c0d",
"ref.db:md5,9372b49e5990fb4b29ed986a1fc737c4",
"ref.idx:md5,b99c3dc612948a7e4e490c3c74498c80",
"ref.inf:md5,e7ae0e08f730a851abba085e6cc7b4ec",
"taxonomy.idx:md5,f6c520613e1154909658f976498dd4a8",
"taxonomy.map:md5,5bb3f2192e925bca2e61e4b54f1671e0",
"taxonomy.tre:md5,f76fb2d5aa9b0d637234d48175841e0e"
],
"meta": {
"nf-test": "0.9.0",
"nextflow": "24.10.2"
},
"timestamp": "2025-01-09T16:49:09.883808608"
}
}
81 changes: 41 additions & 40 deletions workflows/createtaxdb.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,29 +4,28 @@
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

include { MULTIQC } from '../modules/nf-core/multiqc/main'
include { paramsSummaryMap } from 'plugin/nf-schema'
include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline'
include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline'
include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_createtaxdb_pipeline'
include { MULTIQC } from '../modules/nf-core/multiqc/main'
include { paramsSummaryMap } from 'plugin/nf-schema'
include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline'
include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline'
include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_createtaxdb_pipeline'

// Preprocessing
include { GUNZIP as GUNZIP_DNA } from '../modules/nf-core/gunzip/main'
include { PIGZ_COMPRESS as PIGZ_COMPRESS_DNA } from '../modules/nf-core/pigz/compress/main'
include { PIGZ_COMPRESS as PIGZ_COMPRESS_AA } from '../modules/nf-core/pigz/compress/main'
include { CAT_CAT as CAT_CAT_DNA } from '../modules/nf-core/cat/cat/main'
include { CAT_CAT as CAT_CAT_AA } from '../modules/nf-core/cat/cat/main'
include { GUNZIP as GUNZIP_DNA } from '../modules/nf-core/gunzip/main'
include { GUNZIP as GUNZIP_AA } from '../modules/nf-core/gunzip/main'
include { CAT_CAT as CAT_CAT_DNA } from '../modules/nf-core/cat/cat/main'
include { CAT_CAT as CAT_CAT_AA } from '../modules/nf-core/cat/cat/main'

// Database building (with specific auxiliary modules)
include { CENTRIFUGE_BUILD } from '../modules/nf-core/centrifuge/build/main'
include { DIAMOND_MAKEDB } from '../modules/nf-core/diamond/makedb/main'
include { GANON_BUILDCUSTOM } from '../modules/nf-core/ganon/buildcustom/main'
include { KAIJU_MKFMI } from '../modules/nf-core/kaiju/mkfmi/main'
include { KRAKENUNIQ_BUILD } from '../modules/nf-core/krakenuniq/build/main'
include { UNZIP } from '../modules/nf-core/unzip/main'
include { MALT_BUILD } from '../modules/nf-core/malt/build/main'
include { CENTRIFUGE_BUILD } from '../modules/nf-core/centrifuge/build/main'
include { DIAMOND_MAKEDB } from '../modules/nf-core/diamond/makedb/main'
include { GANON_BUILDCUSTOM } from '../modules/nf-core/ganon/buildcustom/main'
include { KAIJU_MKFMI } from '../modules/nf-core/kaiju/mkfmi/main'
include { KRAKENUNIQ_BUILD } from '../modules/nf-core/krakenuniq/build/main'
include { UNZIP } from '../modules/nf-core/unzip/main'
include { MALT_BUILD } from '../modules/nf-core/malt/build/main'

include { FASTA_BUILD_ADD_KRAKEN2_BRACKEN } from '../subworkflows/nf-core/fasta_build_add_kraken2_bracken/main'
include { FASTA_BUILD_ADD_KRAKEN2_BRACKEN } from '../subworkflows/nf-core/fasta_build_add_kraken2_bracken/main'

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down Expand Up @@ -54,10 +53,13 @@ workflow CREATETAXDB {
DATA PREPARATION
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

// PREPARE: Prepare input for modules that accept only a single input file
def malt_build_mode = null
if (params.build_malt) {
malt_build_mode = params.malt_build_params.contains('--sequenceType Protein') ? 'protein' : 'nucleotide'
}

if ([params.build_malt, params.build_centrifuge, params.build_kraken2, params.build_bracken, params.build_krakenuniq, params.build_ganon].any()) {
if ([(params.build_malt && malt_build_mode == 'nucleotide'), params.build_centrifuge, params.build_kraken2, params.build_bracken, params.build_krakenuniq, params.build_ganon].any()) {
// Pull just DNA sequences

ch_dna_refs_for_singleref = ch_samplesheet
Expand All @@ -66,14 +68,14 @@ workflow CREATETAXDB {
fasta_dna
}

ch_dna_for_unzipping = ch_dna_refs_for_singleref.branch { meta, fasta ->
ch_dna_for_unzipping = ch_dna_refs_for_singleref.branch { _meta, fasta ->
zipped: fasta.extension == 'gz'
unzipped: true
}

GUNZIP_DNA(ch_dna_for_unzipping.zipped)
ch_prepped_dna_fastas_ungrouped = GUNZIP_DNA.out.gunzip.mix(ch_dna_for_unzipping.unzipped)
ch_prepped_dna_fastas = ch_prepped_dna_fastas_ungrouped.map { meta, fasta -> [[id: params.dbname], fasta] }.groupTuple()
ch_prepped_dna_fastas = ch_prepped_dna_fastas_ungrouped.map { _meta, fasta -> [[id: params.dbname], fasta] }.groupTuple()
ch_versions = ch_versions.mix(GUNZIP_DNA.out.versions.first())

// Place in single file
Expand All @@ -87,22 +89,23 @@ workflow CREATETAXDB {
// docs: https://github.com/bioinformatics-centre/kaiju#custom-database
// docs: https://github.com/nf-core/test-datasets/tree/taxprofiler#kaiju
// idea: try just appending `_<tax_id_from_meta>` to end of each sequence header using a local sed module... it might be sufficient
if ([params.build_kaiju, params.build_diamond].any()) {
if ([(params.build_malt && malt_build_mode == 'protein'), params.build_kaiju, params.build_diamond].any()) {

ch_aa_refs_for_singleref = ch_samplesheet
.map { meta, fasta_dna, fasta_aa -> [[id: params.dbname], fasta_aa] }
.filter { meta, fasta_aa ->
.map { _meta, _fasta_dna, fasta_aa -> [[id: params.dbname], fasta_aa] }
.filter { _meta, fasta_aa ->
fasta_aa
}

ch_aa_for_zipping = ch_aa_refs_for_singleref.branch { meta, fasta ->
ch_aa_for_unzipping = ch_aa_refs_for_singleref.branch { _meta, fasta ->
zipped: fasta.extension == 'gz'
unzipped: true
}

PIGZ_COMPRESS_AA(ch_aa_for_zipping.unzipped)
ch_prepped_aa_fastas = PIGZ_COMPRESS_AA.out.archive.mix(ch_aa_for_zipping.zipped).groupTuple()
//ch_versions = ch_versions.mix( PIGZ_COMPRESS_AA.versions.first() )
GUNZIP_AA(ch_aa_for_unzipping.zipped)
ch_prepped_aa_fastas_ungrouped = GUNZIP_AA.out.gunzip.mix(ch_aa_for_unzipping.unzipped)
ch_prepped_aa_fastas = ch_prepped_aa_fastas_ungrouped.map { _meta, fasta -> [[id: params.dbname], fasta] }.groupTuple()
ch_versions = ch_versions.mix(GUNZIP_AA.out.versions.first())

CAT_CAT_AA(ch_prepped_aa_fastas)
ch_singleref_for_aa = CAT_CAT_AA.out.file_out
Expand Down Expand Up @@ -148,7 +151,7 @@ workflow CREATETAXDB {
.map { it.join("\t") }
.collectFile(
name: "ganon_fasta_input.tsv",
newLine: true
newLine: true,
)
.map {
[[id: params.dbname], it]
Expand Down Expand Up @@ -209,17 +212,17 @@ workflow CREATETAXDB {

// The mapping DB file is distributed as a zip archive on the MEGAN6 website, so unzip it first when needed
if (file(params.malt_mapdb).extension == 'zip') {
ch_malt_mapdb = UNZIP([[], params.malt_mapdb]).unzipped_archive.map { meta, file -> [file] }
ch_malt_mapdb = UNZIP([[], params.malt_mapdb]).unzipped_archive.map { _meta, file -> [file] }
}
else {
ch_malt_mapdb = file(params.malt_mapdb)
}

if (params.malt_build_params.contains('--sequenceType Protein')) {
ch_input_for_malt = ch_prepped_aa_fastas.map { meta, file -> file }
if (malt_build_mode == 'protein') {
ch_input_for_malt = ch_prepped_aa_fastas.map { _meta, file -> file }
}
else {
ch_input_for_malt = ch_prepped_dna_fastas.map { meta, file -> file }
ch_input_for_malt = ch_prepped_dna_fastas.map { _meta, file -> file }
}

MALT_BUILD(ch_input_for_malt, [], ch_malt_mapdb)
Expand All @@ -236,11 +239,9 @@ workflow CREATETAXDB {
softwareVersionsToYAML(ch_versions)
.collectFile(
storeDir: "${params.outdir}/pipeline_info",

name: 'nf_core_' + 'createtaxdb_software_' + 'mqc_' + 'versions.yml',

name: 'nf_core_' + 'createtaxdb_software_' + 'mqc_' + 'versions.yml',
sort: true,
newLine: true
newLine: true,
)
.set { ch_collated_versions }

Expand Down Expand Up @@ -278,7 +279,7 @@ workflow CREATETAXDB {
ch_multiqc_files = ch_multiqc_files.mix(
ch_methods_description.collectFile(
name: 'methods_description_mqc.yaml',
sort: true
sort: true,
)
)

Expand All @@ -288,7 +289,7 @@ workflow CREATETAXDB {
ch_multiqc_custom_config.toList(),
ch_multiqc_logo.toList(),
[],
[]
[],
)

emit:
Expand Down

0 comments on commit b932606

Please sign in to comment.