Merge pull request #14 from GallVp/feat/nfcore

Added input schema and config from the nf-core template
oushujun · Dec 18, 2024 · 54210ac · 54210ac
2 parents c0c34c2 + c509827
commit 54210ac
Show file tree

Hide file tree

Showing 40 changed files with 2,037 additions and 213 deletions.
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
@@ -9,12 +9,6 @@ These are the most common things requested on pull requests (PRs).
 
 ## PR checklist
 
-For Nextflow implementation,
-
-- [ ] `conda` and `container` directives are included for each process
-- [ ] Docker container + singularity container (optional) are included for each process
-- [ ] Flow `meta.id` with each data channel
-- [ ] Use nf-core resource labels such as `process_high`
-- [ ] Used nf-core module
-- [ ] Use `versions.yml` or versions topic
-- [ ] No process in the `main.nf`. We can have a process in a sub-workflow file
+- [ ] This comment contains a description of changes (with reason).
+- [ ] If you've fixed a bug or added code that should be tested, add tests!
+- [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir <OUTDIR>`).
diff --git a/.github/version_checks.sh b/.github/version_checks.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+config_version=$(sed -n "/^\s*version\s*=\s*'/s/version//p" nextflow.config | tr -d "=[:space:]'")
+perl_version=$(sed -n 's|^my $version = "\(.*\)";|\1|p' EDTA.pl | tr -d '[:space:]')
+
+if [[ "v$config_version" != $perl_version ]]; then
+    echo "config_version (v$config_version) != perl_version ($perl_version)"
+    exit 1
+fi
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -46,7 +46,7 @@ jobs:
       fail-fast: false
       matrix:
         NXF_VER:
-          - "24.04.2"
+          - "24.10.3"
 
         nf_test_files: ["${{ fromJson(needs.nf-test-changes.outputs.nf_test_files) }}"]
         profile: [conda, docker, singularity]
@@ -72,7 +72,7 @@ jobs:
 
       - uses: nf-core/[email protected]
         with:
-          version: 0.9.0
+          version: 0.9.2
 
       - name: Setup apptainer
         if: matrix.profile == 'singularity'

diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,4 @@ work/
 .nextflow/*
 results/
 .nf-test*
+null/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -9,3 +9,11 @@ repos:
         always_run: true
         fail_fast: true
         pass_filenames: false
+      - id: version_checks
+        name: Version checks
+        language: system
+        entry: >
+          .github/version_checks.sh
+        always_run: true
+        fail_fast: true
+        pass_filenames: false
diff --git a/README.md b/README.md
@@ -1,29 +1,34 @@
 [![install with bioconda](https://anaconda.org/bioconda/edta/badges/platforms.svg)](https://anaconda.org/bioconda/edta) [![Anaconda-Server Badge](https://anaconda.org/bioconda/edta/badges/license.svg)](https://github.com/oushujun/EDTA/blob/master/LICENSE) [![Anaconda-Server Badge](https://anaconda.org/bioconda/edta/badges/version.svg)](https://anaconda.org/bioconda/edta) [![Anaconda-Server Badge](https://anaconda.org/bioconda/edta/badges/downloads.svg)](https://anaconda.org/bioconda/edta)
 
-
-# The Extensive *de novo* TE Annotator (EDTA)
-
-## Table of Contents
-
-   * [Introduction](#introduction)
-   * [Installation](#installation)
-      * [Quick installation using conda/mamba](#install-with-condamamba-linux64)
-      * [Quick installation using Singularity](#install-with-singularity-good-for-hpc-users)
-      * [Quick installation using Docker](#install-with-docker-good-for-rootmacosapple-m-chip-users)
-   * [Testing](#testing)
-   * [Inputs](#inputs)
-   * [Outputs](#outputs)
-   * [EDTA usage](#edta-usage)
-      * [From head to toe](#from-head-to-toe)
-      * [Divide and conquer](#divide-and-conquer)
-      * [Protips and self-diagnosis](#protips-and-self-diagnosis)
-   * [panEDTA usage](#panedta-usage)
-   * [Benchmark](#benchmark)
-   * [Citations](#citations)
-   * [Other resources](#other-resources)
-   * [Questions and Issues](#questions-and-issues)
-   * [Acknowledgements](#acknowledgements)
-
+[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A524.10.3-23aa62.svg)](https://www.nextflow.io/)
+[![run with conda](http://img.shields.io/badge/run%20with-conda%20-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)
+[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)
+[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)
+[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)
+
+# The Extensive *de novo* TE Annotator (EDTA)<!-- omit in toc -->
+
+## Table of Contents<!-- omit in toc -->
+
+- [Introduction](#introduction)
+- [Installation](#installation)
+  - [Install with conda/mamba (Linux64)](#install-with-condamamba-linux64)
+  - [Install with Singularity (good for HPC users)](#install-with-singularity-good-for-hpc-users)
+  - [Install with Docker (good for root/macOS/Apple M-chip users)](#install-with-docker-good-for-rootmacosapple-m-chip-users)
+- [Testing](#testing)
+- [Inputs](#inputs)
+- [Outputs](#outputs)
+- [EDTA Usage](#edta-usage)
+  - [From head to toe](#from-head-to-toe)
+  - [Divide and conquer](#divide-and-conquer)
+  - [Protips and self-diagnosis](#protips-and-self-diagnosis)
+  - [Run with Nextflow](#run-with-nextflow)
+- [panEDTA usage](#panedta-usage)
+- [Benchmark](#benchmark)
+- [Citations](#citations)
+- [Other resources](#other-resources)
+- [Questions and Issues](#questions-and-issues)
+- [Acknowledgements](#acknowledgements)
 
 ## Introduction
 This package is developed for automated whole-genome *de-novo* TE annotation and benchmarking the annotation performance of TE libraries.
@@ -41,7 +46,6 @@ To benchmark the annotation quality of a new library/method, I have provided the
 
 For pan-genome annotations, you need to annotate each genome with EDTA, generate a pan-genome library, then reannotate each genome with the pan-genome library. Please refer to this [example](https://github.com/HuffordLab/NAM-genomes/tree/master/te-annotation) for details. A sequential version of panEDTA is also included in this package. 
 
-
 ## Installation
 
 There are several ways to install EDTA. You just need to find the one that works for your system. If you are not using macOS, you may try the conda approach before the Singularity approach.
@@ -120,8 +124,8 @@ Visit [BioContainers](https://quay.io/repository/biocontainers/edta?tab=tags) re
 
 Note: Because only the current directory is mounted to the EDTA docker container, you have to copy all needed files to the current directory and provide them to EDTA without path specifications. Even providing the absolute path to the file located in this folder won't work. Softlinked files are considered "with path" and won't work. Similarly, specifying your own versions of dependency programs (i.e., repeatmasker, repeatmodeler) won't work because they have paths.
 
-
 ## Testing
+
 You should test the EDTA pipeline with a 1-Mb toy genome, which takes about five minutes. If your test finishes without any errors (warnings are OK), then EDTA should be correctly installed. If the test is OK but you encounter errors with your data, you should check your own data for any formatting/naming mistakes.
 
 ```
@@ -131,17 +135,17 @@ perl ../EDTA.pl --genome genome.fa --cds genome.cds.fa --curatedlib ../database/
 
 If your test fails, you may check out this [collection of issues](https://github.com/oushujun/EDTA/wiki/Installations,-builds,-and-tests-Q&A) for possible reasons and solutions. If none works, you may open a new issue.
 
-
 ## Inputs
+
 Required: The genome file [FASTA]. Please make sure sequence names are short (<=13 characters) and simple (i.e, letters, numbers, and underscore).
 
 Optional: 
 1. Coding sequence of the species or a closely related species [FASTA]. This file helps to purge gene sequences in the TE library.
 2. Known gene positions of this version of the genome assembly [BED]. Coordinates specified in this file will be excluded from TE annotation to avoid over-masking.
 3. Curated TE library of the species [FASTA]. This file is trusted 100%. Please make sure it's curated. If you only have a couple of curated sequences, that's also good. It doesn't need to be complete. Providing curated TE sequences, especially for those under-annotated TE types (i.e., SINEs and LINEs), will greatly improve the annotation quality. For more information, please visit this wiki page: [How to prepare a curated library to maximize the efficacy of EDTA](https://github.com/oushujun/EDTA/wiki/How-to-prepare-a-curated-library-to-maximize-the-efficacy-of-EDTA)
 
-
 ## Outputs
+
 A non-redundant TE library: $genome.mod.EDTA.TElib.fa. The curated library will be included in this file if provided. The [rice library](./database/rice7.0.0.liban) will be (partially) included if `--force 1` is specified. TEs are classified into the superfamily level and using the three-letter naming system reported in [Wicker et al. (2007)](https://www.nature.com/articles/nrg2165). Each sequence can be considered as a TE family. To convert between classification systems, please refer to the [TE sequence ontology file](./bin/TE_Sequence_Ontology.txt).
 
 Optional 1:
@@ -155,10 +159,10 @@ Optional 2, when you specify the `--anno 1` parameter, you will get:
 6. Annotation inconsistency for nested TEs: $genome.mod.EDTA.TE.fa.stat.nested.sum.   
 7. Overall annotation inconsistency: $genome.mod.EDTA.TE.fa.stat.all.sum.
 
-
 ## EDTA Usage
 
 ### From head to toe
+
 *You got a genome and you want to get a high-quality TE annotation:*
 
     perl EDTA.pl [options]
@@ -189,6 +193,7 @@ Optional 2, when you specify the `--anno 1` parameter, you will get:
       --help|-h	Display this help info
 
 ### Divide and conquer
+
 *Identify intact elements of a particular TE type*:
 
 1.Get raw TEs from a genome (specify `-type ltr|tir|helitron` in different runs)
@@ -207,11 +212,35 @@ Optional 2, when you specify the `--anno 1` parameter, you will get:
     perl EDTA.pl --overwrite 0 [options]
 
 ### Protips and self-diagnosis
+
 1. It's never said enough. You should tidy up all your sequence names before ANY analysis. Keep them short, simple, and unique.
 2. Run it in a fast drive (i.e., SSD) because RepeatMasker/RepeatModeler is I/O intense.
 3. Check out the [Wiki page](https://github.com/oushujun/EDTA/wiki) for more information and frequently asked questions.
 
+### Run with [Nextflow](https://www.nextflow.io/)
+
+A limited beta version of the pipeline implemented in Nextflow is also available. The pipeline can be launched with Nextflow without the need to download it. Nextflow takes care of the pipeline download and dependency setup through the chosen execution engine such as Docker, Singularity, Conda, Mamba, Podman, etc.
+
+```bash
+nextflow run oushujun/EDTA \
+  -revision <version> \
+  -profile <docker/singularity/conda/...> \
+  --genome <genome.fasta/genomes.txt> \
+  --outdir <OUTDIR>
+```
+
+Where, `--genome` is either a path to a fasta file or a text file which lists fasta files, and `--outdir` is a path to a directory where the pipeline outputs should be stored.
+
+If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. For example, when using Conda for dependency management, make sure Conda is installed and then test the pipeline on the included test data with:
+
+```bash
+nextflow run oushujun/EDTA \
+  -profile conda,test \
+  --outdir results
+```
+
 ## panEDTA usage
+
 This is the serial version of panEDTA. Each genome will be annotated sequentially and then combined with the panEDTA functionality. Existing EDTA annotation of genomes (EDTA run with --anno 1) will be recognized and reused. A way to accelerate the pan-genome annotation is to execute EDTA annotation of each genome separately and in parallel, then execute panEDTA to finish the remaining runs. You may want to save the GFF files and the sum file of the EDTA results of each genome because they will be overwritten by panEDTA. To help filtering out gene-related sequences, at least one CDS file is required. Please read [wiki](https://github.com/oushujun/EDTA/wiki/Making-sense-of-EDTA-usage-and-outputs---Q&A) for the CDS requirement. You may want to check out the toy example in the ./test folder to get familiarized.
 
     sh panEDTA.sh -g genome_list.txt -c cds.fasta -t 10
@@ -231,6 +260,7 @@ This is the serial version of panEDTA. Each genome will be annotated sequentiall
 
 
 ## Benchmark
+
 If you developed a new TE method/got a TE library and want to compare its annotation performance to the methods we have tested, you can:
 
 1.annotate the rice genome with your test library:
@@ -256,6 +286,7 @@ eg.
 Note: the -std and -tst files should be named differently even if they are placed in different folders.
 
 ## Citations
+
 Please cite our paper if you find EDTA useful:
 
 Ou S., Su W., Liao Y., Chougule K., Agda J. R. A., Hellinga A. J., Lugo C. S. B., Elliott T. A., Ware D., Peterson T., Jiang N.✉, Hirsch C. N.✉ and Hufford M. B.✉ (2019). Benchmarking Transposable Element Annotation Methods for Creation of a Streamlined, Comprehensive Pipeline. [Genome Biol. 20(1): 275.](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1905-y)
@@ -267,10 +298,13 @@ Ou S., Collins T., Qiu Y., Seetharam A., Menard C., Manchanda N., Gent J., Schat
 Please also cite the software packages that were used in EDTA, listed in the [EDTA/bin](./bin) directory.
 
 ## Other resources
+
 You may download the [rice genome here](http://rice.uga.edu/pub/data/Eukaryotic_Projects/o_sativa/annotation_dbs/pseudomolecules/version_7.0/all.dir/) (the "all.con" file).
 
 ## Questions and Issues
+
 You may want to check out this [Q&A page](https://github.com/oushujun/EDTA/wiki) for best practices and answers to common questions. If you have other issues with installation and usage, please check if similar issues have been reported in [Issues](https://github.com/oushujun/EDTA/issues) or open a new issue. If you are (looking for) happy users, please read or write successful cases [here](https://github.com/oushujun/EDTA/issues/15).
 
 ## Acknowledgements
+
 I want to thank [Jacques Dainat](https://github.com/Juke34) for contribution of the EDTA conda recipe as well as improving the codes. I also want to thank [Qiushi Li](https://github.com/QiushiLi), [Zhigui Bao](https://github.com/baozg), [Philipp Bayer](https://github.com/philippbayer), [Nick Carleson](https://github.com/Neato-Nick), [@aderzelle](https://github.com/aderzelle), [Sanzhen Liu](https://github.com/liu3zhenlab), [Zhougeng Xu](https://github.com/xuzhougeng), [Shun Wang](https://github.com/wangshun1121), [Nancy Manchanda](https://github.com/nm100), [Eric Burgueño](https://github.com/eburgueno), [Sergei Ryazansky](https://github.com/DrHogart), and many more others for testing, debugging, and improving the EDTA pipeline.
diff --git a/cleanNXF.sh b/cleanNXF.sh
@@ -13,3 +13,6 @@ echo "Cleaned work..."
 rm -f .nf-test.log
 rm -rf .nf-test
 echo "Cleaned nf-test..."
+
+rm -rf null
+echo "Cleaned null..."
diff --git a/conf/base.config b/conf/base.config
@@ -0,0 +1,49 @@
+process {  // Default resource requests and error handling for processes, followed by per-label overrides
+
+    cpus   = { 1      * task.attempt }  // closures multiply the base request by task.attempt
+    memory = { 6.GB   * task.attempt }
+    time   = { 4.h    * task.attempt }
+
+    errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' }  // 'retry' only for exit status 130-145 or 104, otherwise 'finish'
+    maxRetries    = 1
+    maxErrors     = '-1'
+
+    // Process-specific resource requirements
+    // NOTE - Please try and re-use the labels below as much as possible.
+    //        These labels are used and recognised by default in DSL2 files hosted on nf-core/modules.
+    //        If possible, it would be nice to keep the same label naming convention when
+    //        adding in your local modules too.
+    withLabel:process_single {
+        cpus   = { 1                   }  // fixed at one CPU; only memory and time scale with task.attempt
+        memory = { 6.GB * task.attempt }
+        time   = { 4.h  * task.attempt }
+    }
+    withLabel:process_low {
+        cpus   = { 2     * task.attempt }
+        memory = { 12.GB * task.attempt }
+        time   = { 4.h   * task.attempt }
+    }
+    withLabel:process_medium {
+        cpus   = { 6     * task.attempt }
+        memory = { 36.GB * task.attempt }
+        time   = { 8.h   * task.attempt }
+    }
+    withLabel:process_high {
+        cpus   = { 12    * task.attempt }
+        memory = { 72.GB * task.attempt }
+        time   = { 16.h  * task.attempt }
+    }
+    withLabel:process_long {  // sets time only
+        time   = { 20.h  * task.attempt }
+    }
+    withLabel:process_high_memory {  // sets memory only
+        memory = { 200.GB * task.attempt }
+    }
+    withLabel:error_ignore {  // replaces the exit-status based errorStrategy above with 'ignore'
+        errorStrategy = 'ignore'
+    }
+    withLabel:error_retry {  // unconditional 'retry' with maxRetries raised from 1 to 2
+        errorStrategy = 'retry'
+        maxRetries    = 2
+    }
+}
diff --git a/conf/modules.config b/conf/modules.config
@@ -1,67 +1,67 @@
 process {
 
-    withName: 'EDTA:CUSTOM_SHORTENFASTAIDS' {
+    withName: 'OUSHUJUN_EDTA:EDTA:CUSTOM_SHORTENFASTAIDS' {
         publishDir = [
             path: { "${params.outdir}/${meta.id}" },
             mode: params.publish_dir_mode,
             saveAs: { filename ->  ( filename.equals('versions.yml') || filename.endsWith('fasta') ) ? null : filename }
         ]
     }
 
-    withName: 'EDTA:LTRHARVEST' {
+    withName: 'OUSHUJUN_EDTA:EDTA:LTRHARVEST' {
         ext.args = '-size 1000000 -time 300'
         ext.prefix = { "${meta.id}_ltrharvest" }
     }
 
-    withName: 'EDTA:LTRFINDER' {
+    withName: 'OUSHUJUN_EDTA:EDTA:LTRFINDER' {
         ext.args = '-harvest_out -size 1000000 -time 300'
     }
 
-    withName: 'EDTA:CAT_CAT' {
+    withName: 'OUSHUJUN_EDTA:EDTA:CAT_CAT' {
         ext.prefix = { "${meta.id}_ltrharvest_ltrfinder.tabout" }
     }
 
-    withName: 'EDTA:ANNOSINE' {
+    withName: 'OUSHUJUN_EDTA:EDTA:ANNOSINE' {
         ext.prefix = { "${meta.id}.annosine" }
         ext.args = params.annosine_ext_args ?: '-a 2 --num_alignments 50000 -rpm 0 --copy_number 3 --shift 100 -auto 1'
     }
 
-    withName: 'EDTA:REPEATMODELER_REPEATMODELER' {
+    withName: 'OUSHUJUN_EDTA:EDTA:REPEATMODELER_REPEATMODELER' {
         ext.args = '-engine ncbi'
     }
 
-    withName: 'EDTA:TIRLEARNER' {
+    withName: 'OUSHUJUN_EDTA:EDTA:TIRLEARNER' {
         ext.prefix = { "${meta.id}.tirlearner" }
     }
 
-    withName: 'EDTA:FASTA_HELITRONSCANNER_SCAN_DRAW:HELITRONSCANNER_DRAW' {
+    withName: 'OUSHUJUN_EDTA:EDTA:FASTA_HELITRONSCANNER_SCAN_DRAW:HELITRONSCANNER_DRAW' {
         ext.args = '-pure_helitron'
     }
 
-    withName: 'EDTA:FASTA_HELITRONSCANNER_SCAN_DRAW:HELITRONSCANNER_SCAN_HEAD_RC' {
+    withName: 'OUSHUJUN_EDTA:EDTA:FASTA_HELITRONSCANNER_SCAN_DRAW:HELITRONSCANNER_SCAN_HEAD_RC' {
         ext.prefix = { "${meta.id}.rc" }
         ext.args = '--rc'
     }
 
-    withName: 'EDTA:FASTA_HELITRONSCANNER_SCAN_DRAW:HELITRONSCANNER_SCAN_TAIL_RC' {
+    withName: 'OUSHUJUN_EDTA:EDTA:FASTA_HELITRONSCANNER_SCAN_DRAW:HELITRONSCANNER_SCAN_TAIL_RC' {
         ext.prefix = { "${meta.id}.rc" }
         ext.args = '--rc'
     }
 
-    withName: 'EDTA:FASTA_HELITRONSCANNER_SCAN_DRAW:HELITRONSCANNER_DRAW_RC' {
+    withName: 'OUSHUJUN_EDTA:EDTA:FASTA_HELITRONSCANNER_SCAN_DRAW:HELITRONSCANNER_DRAW_RC' {
         ext.prefix = { "${meta.id}.rc" }
         ext.args = '-pure_helitron'
     }
 
-    withName: 'EDTA:FORMAT_HELITRONSCANNER_OUT' {
+    withName: 'OUSHUJUN_EDTA:EDTA:FORMAT_HELITRONSCANNER_OUT' {
         ext.args = '-sitefilter 1 -minscore 12 -keepshorter 1 -extout 0'
     }
 
-    withName: 'EDTA:FORMAT_HELITRONSCANNER_OUT_EXT' {
+    withName: 'OUSHUJUN_EDTA:EDTA:FORMAT_HELITRONSCANNER_OUT_EXT' {
         ext.args = '-sitefilter 1 -minscore 12 -keepshorter 1 -extlen 30 -extout 1'
     }
 
-    withName: 'EDTA:FINAL_FILTER' {
+    withName: 'OUSHUJUN_EDTA:EDTA:FINAL_FILTER' {
         publishDir = [
             path: { "${params.outdir}/${meta.id}" },
             mode: params.publish_dir_mode,
@@ -76,7 +76,7 @@ process {
         ]
     }
 
-    withName: 'EDTA:CUSTOM_RESTOREGFFIDS' {
+    withName: 'OUSHUJUN_EDTA:EDTA:CUSTOM_RESTOREGFFIDS' {
         publishDir = [
             path: { "${params.outdir}/${meta.id}" },
             mode: params.publish_dir_mode,