#!/usr/bin/env nextflow
import java.nio.file.*
import groovy.io.FileType
//Check for the pipeline version. The pipeline is only executed if the version is not requested
params.version = false
if(params.version){
println(workflow.manifest.version)
} else {
params.split_fastq_by = 1000000
params.speed = false
//essential inputs
input_reads = Channel.fromPath( params.reads ) //FASTQ file(s) containing reads
reference = Channel.fromPath( params.reference ) //FASTA file containing reference sequence(s)
barcode_file = Channel.fromPath( params.barcodes ) //TSV file containing experiment names and the corresponding experimental barcode sequences
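// Illustrative barcodes TSV layout (experiment names and barcodes are hypothetical);
// one experiment per line: name, a tab, then the experimental barcode:
//   knockdown_rep_1	GAGCTC
//   knockdown_rep_2	CTAGGT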
reference.into { reference_to_mapping; reference_to_extract_transcripts; reference_to_extract_sequences; reference_to_pureCLIP; reference_to_strand_preference; reference_to_chrom_sizes; reference_to_collect; reference_to_igv }
params.barcode_pattern = "NNNNNXXXXXXNNNN" //STRING containing barcode pattern -> N = random barcode; X = experimental barcode
val_barcode_pattern = Channel.from( params.barcode_pattern )
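// Illustrative example: with the default pattern NNNNNXXXXXXNNNN, a read starting
// with ACGTTGAGCTCTCGA... (hypothetical sequence) is interpreted as
//   ACGTT  -> random barcode (5 nt)
//   GAGCTC -> experimental barcode (6 nt)
//   TCGA   -> random barcode (4 nt)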
params.domain = "pro" //STRING decides if bowtie2 or STAR is used -> pro = bowtie2; eu = STAR
params.annotation = 'NO_FILE' //GFF/GTF file containing the annotation belonging to the reference
params.output = "./output/" //PATH leading to directory in which output is supposed to be stored
params.min_length = 30 //INT minimum length of reads required after preprocessing
params.min_qual = 20 //INT minimum quality of nucleotide required to retain it
params.min_percent_qual_filter = 90 //INT minimum percentage of nucleotides in a read required to have their quality above params.min_qual to retain the read
params.barcode_mismatches = 1 //INT maximum number of mismatches in the experimental barcode sequence to still allocate the read
params.mapq = 2 //INT minimum mapq-score needed to retain an alignment -> soon to be deprecated
params.imgDir = "$PWD/work/images" //PATH directory to store singularity images
params.map_to_transcripts = false //BOOLEAN decides if reads are mapped to transcripts and the top X transcripts are reported in the output
params.number_top_transcripts = 10 //INT number of top transcripts presented when params.map_to_transcripts = true
params.merge_replicates = false
params.correlation_analysis = false
params.combine_strands_correlation = false
// parameters for peak calling
params.omit_peak_calling = false //BOOLEAN decides if peak calling via pureclip takes place after normal processing
params.peak_calling_for_high_coverage = false //BOOLEAN adds arguments to PureCLIP that allow the tool to run on BAM files with coverage across the entire reference
params.peak_calling_regions = false
params.peak_calling_regions_width = 8
// parameters for RNA subtypes
params.gene_id = "ID" //STRING name of gene_id used within the annotation file
params.color_barplot = "#69b3a2" //STRING color used for barplots
//params.rna_subtypes = 'lnc_RNA,miRNA,mRNA,ncRNA,rRNA,snoRNA,snRNA,tRNA'
params.rna_subtypes = '3_prime_UTR,transcript,5_prime_UTR' //STRING RNA-subtypes used for the distribution of CL-sites
Channel.from(params.rna_subtypes.split(',')).into{ rna_subtypes_to_feature_counts; rna_subtypes_to_distribution }
if(params.annotation != 'NO_FILE'){
annotation_file = Channel.fromPath( params.annotation )
annotation_file.into{annotation_to_RNA_subtypes_distribution;annotation_to_prepare_for_igv}
}
//parameters for peak distance
params.omit_peak_distance = false
params.percentile = 90 //INT percentile that decides which cl-sites are considered when calculating distances and extracting sequences
params.distance = 30 //INT maximum distance between cl-sites considered when calculating peak distances
//params for sequence extraction
params.omit_sequence_extraction = false
params.seq_len = 20 //INT number of nucleotides extracted on each side of a cl-site
params.omit_cl_nucleotide = false
params.max_motif_num = 50 // INT max number of motifs to search for
params.min_motif_width = 8 // INT minimum motif width to report, >=3
params.max_motif_width = 15 // INT maximum motif width to report, <= 30
// Check if speed mode is used. If so, the FASTQ input file is split every ${params.split_fastq_by} reads to greatly speed up preprocessing
input_reads.into { input_reads_QC; input_reads_processing }
if (params.speed) {
input_reads_processing.splitFastq( by: params.split_fastq_by, file:true ).set{input_reads_processing}
}
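// Example: with the default split_fastq_by = 1,000,000, a FASTQ file holding
// 3,500,000 reads is split into four chunks (3 x 1,000,000 + 1 x 500,000) that
// are preprocessed in parallel and merged again in merge_preprocessed_reads.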
sjdbGTFfile = file(params.annotation)
//FastQC v0.11.9
process quality_control {
tag {query.simpleName}
input:
file query from input_reads_QC
output:
file "${query.baseName}*" into out1, collect_statistics_qc1
"""
fastqc ${query} -o .
"""
}
process adapter_removal {
tag {query.simpleName}
input:
file query from input_reads_processing
output:
file "${query}_trimmed.fq" into reads_qualityFilter
file "${query}_trimming_report.txt" into collect_statistics_adapter
"""
trim_galore --cores ${task.cpus} --basename ${query} -o . --length ${params.min_length} ${query} --quality 0
"""
}
//FASTX Toolkit 0.0.14
process quality_filter {
tag {query.simpleName}
input:
file query from reads_qualityFilter
output:
file "${query.baseName}.qual-filter.fastq" into fastq_quality_filter_to_barcode_extraction, fastq_quality_filter_to_quality_control_2
file 'summary-quality-filter.txt' into collect_statistics_quality_filter
"""
fastq_quality_filter -v -q ${params.min_qual} -p ${params.min_percent_qual_filter} -i ${query} -o ${query.baseName}.qual-filter.fastq > summary-quality-filter.txt
"""
}
//FastQC v0.11.9
process quality_control_2 {
tag {query.simpleName}
input:
file query from fastq_quality_filter_to_quality_control_2.first()
output:
file "quality-control-2*" into qc_2_out, collect_statistics_qc2
"""
cat ${query} > quality-control-2.fastq
fastqc quality-control-2.fastq -o .
"""
}
process extract_rnd_barcode {
tag {query.simpleName}
input:
file query from fastq_quality_filter_to_barcode_extraction
output:
file "${query.baseName}.rndBarcode.fastq" into fastq_barcode_extraction_to_split_exp_barcode
file("${query.simpleName}.log") into collect_statistics_rnd_barcode_extraction
"""
umi_tools extract --stdin ${query} --bc-pattern ${params.barcode_pattern} --log ${query.simpleName}.log --stdout ${query.baseName}.rndBarcode.fastq
"""
}
process check_barcode_file {
input:
file "barcodes" from barcode_file
output:
file "checkedBarcodes" into checked_barcodes
"""
check_barcode_file.py barcodes > checkedBarcodes
"""
}
//FASTX Toolkit 0.0.14
process split_exp_barcode {
tag {query.simpleName}
input:
set file(query), file(barcodes) from fastq_barcode_extraction_to_split_exp_barcode.combine(checked_barcodes)
output:
file "barcode_split.log" into (collect_statistics_split,log_split_experimental_barcode)
file "output/*.fastq" into fastq_split_barcode_to_remove_exp_barcode
script:
if(params.barcode_mismatches == 0)
"""
mkdir output
cat ${query} | fastx_barcode_splitter.pl --bcfile ${barcodes} --bol --exact --prefix ./output/ --suffix .fastq > barcode_split.log
"""
else
"""
mkdir output
cat ${query} | fastx_barcode_splitter.pl --bcfile ${barcodes} --bol --mismatches ${params.barcode_mismatches} --prefix ./output/ --suffix .fastq > barcode_split.log
"""
}
process get_length_exp_barcode {
input:
val(pattern) from val_barcode_pattern
output:
env(exp_barcode_length) into int_length_exp_barcode
"""
exp_barcode_length=\$((\$(echo -n ${pattern} | sed 's/[^Xx]//g' | wc -m) + 1))
"""
}
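// Worked example: the default pattern NNNNNXXXXXXNNNN contains 6 X characters,
// so exp_barcode_length = 6 + 1 = 7. The +1 turns the barcode length into the
// first 1-based position to keep, as expected by fastx_trimmer -f below.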
//FASTX Toolkit 0.0.14
process remove_exp_barcode {
tag {query.simpleName}
input:
set file(query), val(length_exp_barcode) from fastq_split_barcode_to_remove_exp_barcode.flatten().filter{ it.size() > 0 }.combine(int_length_exp_barcode)
output:
file "*.preprocessed.fastq" into fastq_remove_barcode_to_collect
"""
fastx_trimmer -f ${length_exp_barcode} -i ${query} -o ${query.baseName}.preprocessed.fastq
"""
}
fastq_remove_barcode_to_collect
.map{file -> tuple(file.name - ~/\.[\w.]+.fastq$/,file)}
.groupTuple()
.set{ fastq_collect_preprocessed_to_merge }
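// Illustrative grouping (file names are hypothetical): the regex strips everything
// from the first dot through the ".fastq" suffix, so per-chunk files such as
// "knockdown_rep_1.preprocessed.fastq" emitted by several speed-mode chunks all
// share the key "knockdown_rep_1" and are merged back into one FASTQ below.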
process merge_preprocessed_reads {
tag {name}
input:
set val(name), file("query") from fastq_collect_preprocessed_to_merge
output:
file("${name}.fastq") into fastq_merge_preprocessed_to_alignment
"""
cat ${query} > ${name}.fastq
"""
}
if ( params.domain == 'pro' || params.map_to_transcripts == true){
//bowtie2 version 2.3.5.1
process build_index_bowtie {
input:
file ref from reference_to_mapping
output:
set file("${ref}"), file("${ref}.*") into bowtie_index_build_to_mapping
"""
bowtie2-build ${ref} ${ref}
"""
}
process mapping_bowtie{
tag {query.simpleName}
input:
set file(ref), file(index) from bowtie_index_build_to_mapping.first()
file query from fastq_merge_preprocessed_to_alignment
output:
file "${query.baseName}.bam" into (bam_mapping_to_filter_empty, bam_mapping_to_sort)
file "${query.simpleName}.statistics.txt" into collect_statistics_mapping
"""
bowtie2 --no-unal -q -p ${task.cpus} -U ${query} -x ${ref} 2> ${query.simpleName}.statistics.txt | samtools view -bS - > ${query.baseName}.bam
"""
}
} else if ( params.domain == 'eu' ) {
//Version 2.7.3a
process build_index_STAR {
input:
file referenceGenome from reference_to_mapping
file gtf from sjdbGTFfile
output:
file index into star_index_build_to_mapping
script:
if(params.annotation == 'NO_FILE')
"""
mkdir index
STAR --runThreadN ${task.cpus} --runMode genomeGenerate --genomeDir ./index --genomeFastaFiles ${referenceGenome}
"""
else
"""
mkdir index
STAR --runThreadN ${task.cpus} --runMode genomeGenerate --genomeDir ./index --genomeFastaFiles ${referenceGenome} --sjdbGTFfile ${gtf}
"""
}
process mapping_STAR{
tag {query.simpleName}
input:
set file(query), file(indexDir) from fastq_merge_preprocessed_to_alignment.combine(star_index_build_to_mapping)
output:
file("${query.baseName}.Aligned.sortedByCoord.out.bam") into bam_mapping_to_filter_empty
file("${query.baseName}.Log.*") into collect_statistics_mapping
"""
STAR --runThreadN ${task.cpus} --genomeDir ${indexDir} --readFilesIn ${query} --outFileNamePrefix ${query.baseName}. --alignEndsType Extend5pOfRead1 --outSAMtype BAM SortedByCoordinate
"""
}
}
process filter_empty_bams{
tag {query.simpleName}
input:
file query from bam_mapping_to_filter_empty
output:
file "${query.baseName}.filtered.bam" optional true into bam_filter_empty_to_split
file "${query.simpleName}.no_alignments.txt" optional true into log_experiments_without_alignments
"""
if [[ \$(samtools view ${query} | wc -l) == 0 ]]; then
echo "${query.simpleName} contains no alignable reads" > ${query.simpleName}.no_alignments.txt
else
mv ${query} ${query.baseName}.filtered.bam
fi
"""
}
if( params.map_to_transcripts == false && params.speed ) {
process split_bam_by_chromosome{
tag {query.simpleName}
input:
file(query) from bam_filter_empty_to_split
output:
file("*.bam") into bam_split_to_sort
"""
bamtools split -in ${query} -reference
"""
}
} else {
bam_filter_empty_to_split.set{ bam_split_to_sort }
}
process sort_bam{
tag {query.simpleName}
input:
file query from bam_split_to_sort.flatten()
output:
set file("${query.baseName}.sorted.bam"), file("${query.baseName}.sorted.bam.bai") into bam_sort_to_deduplicate
"""
samtools sort ${query} -o ${query.baseName}.sorted.bam
samtools index ${query.baseName}.sorted.bam
"""
}
process deduplicate{
publishDir "${params.output}/statistics/PCR-deduplication", mode: 'copy', pattern: "${query.baseName}.deduplicated.log*"
tag {query.simpleName}
memory { 20.GB + 2.B * query.size() }
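// Dynamic memory request scaling with input size: e.g. a 1 GB BAM file yields
// 20.GB + 2 * 1e9 B, i.e. roughly 22 GB.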
input:
set file(query), file(index) from bam_sort_to_deduplicate
output:
file "${query.baseName}.deduplicated.bam" into (bam_deduplicate_to_sort,bam_deduplicate_to_index)
file "${query.baseName}.deduplicated.log*" into (log_deduplicate_to_collect_statistics,log_deduplicate_to_output)
"""
umi_tools dedup --random-seed=42 -I ${query} --output-stats ${query.baseName}.deduplicated.log -S ${query.baseName}.deduplicated.bam
"""
}
bam_deduplicate_to_sort
.map{file -> tuple(file.name - ~/\.[\w.]+.bam$/,file)}
.groupTuple()
.set{ bam_dedup_sort_to_merge }
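// Illustrative grouping (names assume bamtools split's ".REF_<chromosome>" naming):
// "sampleA.REF_chr1.sorted.deduplicated.bam" and "sampleA.REF_chr2.sorted.deduplicated.bam"
// both map to the key "sampleA" and are merged back into a single BAM below.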
process merge_deduplicated_bam {
tag {name}
input:
set val(name), file(query) from bam_dedup_sort_to_merge
output:
file("${name}.bam") into (bam_merge_to_calculate_crosslinks, bam_merge_to_extract_transcripts, bam_merge_to_pureCLIP)
file("${name}.bam") into bam_alignment_to_sort
"""
samtools merge -c -p ${name}.bam ${query}
"""
}
process sort_and_index_alignment{
tag {query.simpleName}
publishDir "${params.output}/alignments", mode: 'copy', pattern: "${query.simpleName}.sorted.bam*"
input:
file(query) from bam_alignment_to_sort
output:
file("${query.simpleName}.sorted.bam") into bam_sorted_to_igv_session
file("${query.simpleName}.sorted.bam*")
"""
samtools sort ${query} > ${query.simpleName}.sorted.bam
samtools index ${query.simpleName}.sorted.bam
"""
}
if (params.map_to_transcripts == true){
process count_hits {
tag {bam.simpleName}
publishDir "${params.output}/transcripts/overview-hits", mode: 'copy'
input:
file bam from bam_merge_to_extract_transcripts
output:
file("${bam.baseName}.hits.tsv") into tsv_count_to_get_hits
"""
samtools view ${bam} | cut -f3 | sort | uniq -c | sort -nr > ${bam.baseName}.hits.tsv
"""
}
process get_top_hits {
publishDir "${params.output}/transcripts/overview-hits", mode: 'copy'
input:
file tsv from tsv_count_to_get_hits.flatten().toList()
output:
file("transcript-targets-top${params.number_top_transcripts}.txt") into (txt_get_hits_to_extract_alignments,txt_get_hits_to_extract_sequences)
"""
head -${params.number_top_transcripts} -q *.tsv | rev | cut -f1 -d' ' | rev | sort | uniq > transcript-targets-top${params.number_top_transcripts}.txt
"""
}
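// Illustrative: "uniq -c" above produces hits.tsv lines such as "  1523 transcriptX"
// (count, then reference name; transcriptX is hypothetical). The "rev | cut -f1 -d' ' | rev"
// trick keeps the last space-separated field, i.e. the transcript name.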
process index_alignments {
tag {bam.simpleName}
input:
file bam from bam_deduplicate_to_index
output:
set file("${bam.baseName}.sorted.bam"), file("${bam.baseName}.sorted.bam.bai") into bam_index_to_extract_alignments
"""
samtools sort ${bam} -o ${bam.baseName}.sorted.bam
samtools index ${bam.baseName}.sorted.bam
"""
}
process extract_top_alignments {
tag {bam.simpleName}
input:
set file(bam), file(bai), file(txt_alignments) from bam_index_to_extract_alignments.combine(txt_get_hits_to_extract_alignments)
output:
file("${bam.simpleName}_filtered_top${params.number_top_transcripts}.bam") into bam_extract_alignments_to_calc_crosslink
"""
samtools view -hb ${bam} `cat ${txt_alignments}` > ${bam.simpleName}_filtered_top${params.number_top_transcripts}.bam
"""
}
process remove_newlines {
input:
file ref from reference_to_extract_transcripts
output:
file("${ref.baseName}.removed_newlines.fna") into fasta_rm_newline_to_extract_sequences
"""
awk '!/^>/ {printf "%s", \$0; n = "\\n" } /^>/ { print n \$0; n = "" } END { printf "%s", n }' ${ref} > ${ref.baseName}.removed_newlines.fna
"""
}
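// Illustrative: the awk one-liner unwraps multi-line FASTA records, e.g. ">tx1"
// followed by "ACGT" and "TTTT" becomes ">tx1" followed by "ACGTTTTT", so each
// record spans exactly two lines for the egrep -A1 extraction below.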
process extract_top_transcript_sequences {
tag {txt_sequences.simpleName}
publishDir "${params.output}/transcripts", mode: 'copy'
input:
set file(txt_sequences), file(ref) from txt_get_hits_to_extract_sequences.combine(fasta_rm_newline_to_extract_sequences)
output:
file("${ref.simpleName}.top${params.number_top_transcripts}_transcripts.fna") into (top_transcripts_to_collect,top_transcripts_to_strand_preference)
"""
egrep -A1 --no-group-separator -f ${txt_sequences} ${ref} > ${ref.simpleName}.top${params.number_top_transcripts}_transcripts.fna
"""
}
bam_extract_alignments_to_calc_crosslink
.into{collected_bam_files; collected_bam_files_to_sort}
} else {
bam_merge_to_calculate_crosslinks
.into{collected_bam_files; collected_bam_files_to_sort}
}
process sort_bam_before_strand_pref {
tag {query.baseName}
input:
file(query) from collected_bam_files_to_sort
output:
file("${query.baseName}.sorted.bam") into bam_sort_to_group
"""
samtools sort ${query} > ${query.baseName}.sorted.bam
"""
}
if(params.merge_replicates == true){
bam_sort_to_group
.map{file -> tuple(file.name - ~/_rep_\d*(_filtered_top)?\d*(.sorted)?.bam$/,file)}
.groupTuple()
.set{grouped_bam_to_strand_preference}
} else {
bam_sort_to_group
.map{file -> tuple(file.name - ~/(_filtered_top\d*)?(.sorted)?.bam$/,file)}
.set{grouped_bam_to_strand_preference}
}
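// Illustrative (hypothetical names): with merge_replicates enabled,
// "sampleA_rep_1.sorted.bam" and "sampleA_rep_2.sorted.bam" are grouped under the
// key "sampleA"; without it, every file keeps its own key.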
if (params.map_to_transcripts == true){
top_transcripts_to_strand_preference
.set{choose_correct_reference_file}
} else {
reference_to_strand_preference
.set{choose_correct_reference_file}
}
process determine_strand_preference {
tag {name}
publishDir "${params.output}/strand-distribution", mode: 'copy', pattern: "${name}.strand_proportion.txt"
input:
set val(name),file(query),file(reference) from grouped_bam_to_strand_preference.combine(choose_correct_reference_file)
output:
file("${name}.strand_proportion.txt") into txt_determine_strand_to_visualize
"""
egrep '^>' ${reference} | cut -f1 -d' ' | cut -c2- > references.txt
for i in ${query}
do
samtools index \$i
done
touch ${name}.strand_proportion.txt
echo -e "chromosome\tforward\treverse" >> ${name}.strand_proportion.txt
while read r; do
echo -e "\$r\t\$(for i in *.sorted.bam; do samtools view -F 20 -q ${params.mapq} \$i \$r | wc -l; done | paste -s -d+ | bc)\t\$(for i in *.sorted.bam; do samtools view -f 16 -q ${params.mapq} \$i \$r | wc -l; done | paste -s -d+ | bc)" >> ${name}.strand_proportion.txt;
done <references.txt
"""
}
process visualize_strand_preference {
publishDir "${params.output}/strand-distribution/visualization", mode: 'copy', pattern: "${strand.simpleName}.png"
input:
file(strand) from txt_determine_strand_to_visualize
output:
file("${strand.simpleName}.png")
"""
visualize_strand_distribution.R --input ${strand} --output ${strand.simpleName} --type png
"""
}
process get_chromosome_sizes{
input:
file(ref) from reference_to_chrom_sizes
output:
file("${ref.simpleName}.chromosome_sizes.txt") into (chrom_sizes_to_cross_link_calculation,chrom_sizes_to_bigWig,chrom_sizes_to_correlation)
"""
samtools faidx ${ref}
cut -f1,2 ${ref}.fai > ${ref.simpleName}.chromosome_sizes.txt
"""
}
process calculate_crosslink_sites{
tag {query.simpleName}
publishDir "${params.output}/cross-link-sites/wig", mode: 'copy', pattern: "${query.simpleName}_{forward,reverse}.wig"
input:
set file(query), file(chrom_sizes) from collected_bam_files.combine(chrom_sizes_to_cross_link_calculation)
output:
file "${query.simpleName}.wig2" optional true into (wig_calculate_crosslink_to_group_samples,wig2_calc_cl_sites_to_split)
set val("cross-link-sites"), file("${query.simpleName}_forward.wig"), file("${query.simpleName}_reverse.wig") optional true into wig_cross_link_sites_to_transform
file "${query.simpleName}_{forward,reverse}.wig" optional true into wig_calculate_cl_sites_to_split_strand
"""
create-wig-from-bam.py --input ${query} --mapq ${params.mapq} --chrom_sizes ${chrom_sizes} --output ${query.simpleName}.wig2
if [[ -f "${query.simpleName}.wig2" ]]; then
wig2-to-wig.py --input ${query.simpleName}.wig2 --output ${query.simpleName}
fi
"""
}
//From here on, only downstream analyses
if( params.merge_replicates == true ){
//groups files according to their experiment
wig_calculate_crosslink_to_group_samples
.map{file -> tuple(file.name - ~/_rep_\d*(_filtered_top)?\d*.wig2$/,file)}
.groupTuple()
.set{wig2_grouped_samples_to_merge}
process split_wig2_for_correlation{
tag {query.simpleName}
cache false
input:
file(query) from wig2_calc_cl_sites_to_split
output:
file("${query.simpleName}_forward.wig") optional true into split_wig_forward_to_correlation
file("${query.simpleName}_reverse.wig") optional true into split_wig_reverse_to_correlation
file("${query.simpleName}_{forward,reverse}.wig") optional true into split_wig_both_to_correlation
"""
wig2-to-wig.py --input ${query} --output ${query.simpleName}
"""
}
if( params.correlation_analysis == true ){
if( params.combine_strands_correlation == true ){
both_value = Channel.from("both_strands")
split_wig_both_to_correlation
.flatten()
.map{file -> tuple(file.name - ~/_rep_\d*(_filtered_top)?\d*_(forward|reverse)?.wig$/,file)}
.groupTuple()
.combine(both_value)
.combine(chrom_sizes_to_correlation)
.set{prepared_input_correlation}
} else {
forward_value = Channel.from("forward")
split_wig_forward_to_correlation
.map{file -> tuple(file.name - ~/_rep_\d*(_filtered_top)?_forward.wig$/,file)}
.groupTuple()
.combine(forward_value)
.set{group_forward}
reverse_value = Channel.from("reverse")
split_wig_reverse_to_correlation
.map{file -> tuple(file.name - ~/_rep_\d*(_filtered_top)?_reverse.wig$/,file)}
.groupTuple()
.combine(reverse_value)
.set{group_reverse}
group_forward.concat(group_reverse)
.combine(chrom_sizes_to_correlation)
.set{prepared_input_correlation}
}
process calc_wig_correlation{
tag {name}
publishDir "${params.output}/correlation_of_replicates", mode: 'copy', pattern: "${name}_${strand}_correlation.{png,csv}"
input:
set val(name),file(query),val(strand),file(chrom_sizes) from prepared_input_correlation
output:
file("${name}_${strand}_correlation.png") optional true
file("${name}_${strand}_correlation.csv") optional true
script:
String[] test_size = query
if(params.combine_strands_correlation)
"""
if [[ ${test_size.size()} -gt 1 ]]; then
wig_file_statistics.R --input_path . --chrom_length ${chrom_sizes} --output ${name}_${strand} --both_strands --type png
fi
"""
else
"""
if [[ ${test_size.size()} -gt 1 ]]; then
wig_file_statistics.R --input_path . --chrom_length ${chrom_sizes} --output ${name}_${strand} --type png
fi
"""
}
}
process merge_wigs{
tag {name}
publishDir "${params.output}/cross-link-sites-merged/wig", mode: 'copy', pattern: "${name}_{forward,reverse}.wig"
input:
set name, file(query) from wig2_grouped_samples_to_merge
output:
file "${name}.wig2" into collected_wig_files
set val("cross-link-sites-merged"), file("${name}_forward.wig"), file("${name}_reverse.wig") optional true into wig_merged_cross_link_sites_to_transform
file "${name}_{forward,reverse}.wig" optional true into output
script:
if(name != "unmatched.wig2")
"""
merge-wig.py --wig ${query} --output ${name}.wig2
wig2-to-wig.py --input ${name}.wig2 --output ${name}
"""
else
"""
merge-wig.py --wig ${query} --output unmatched.wig2
wig2-to-wig.py --input unmatched.wig2 --output unmatched
"""
}
} else {
wig_calculate_crosslink_to_group_samples
.set{collected_wig_files}
}
// Transformation of cross-link sites to different formats
if( params.merge_replicates == true ) {
wig_cross_link_sites_to_transform.mix(wig_merged_cross_link_sites_to_transform)
.set{wig_cross_link_sites_to_bigWig}
} else {
wig_cross_link_sites_to_transform
.set{wig_cross_link_sites_to_bigWig}
}
process wig_to_bigWig{
tag {forward.simpleName}
publishDir "${params.output}/${out_dir}/bigWig", mode: 'copy', pattern: "*.bw"
input:
set val(out_dir), file(forward), file(reverse), file(chrom_sizes) from wig_cross_link_sites_to_bigWig.combine(chrom_sizes_to_bigWig)
output:
file("*.bw") optional true
set val(out_dir), file("*.bw") optional true into big_wig_to_igv_session
set val(out_dir), file("${reverse.baseName}.bw") optional true into big_wig_reverse_to_convert_to_bedgraph
set val(out_dir), file("${forward.baseName}.bw") optional true into big_wig_forward_to_convert_to_bedgraph
"""
if [[ \$(cat ${forward} | wc -l) -gt 1 ]]; then
wigToBigWig ${forward} ${chrom_sizes} ${forward.baseName}.bw
fi
if [[ \$(cat ${reverse} | wc -l) -gt 1 ]]; then
wigToBigWig ${reverse} ${chrom_sizes} ${reverse.baseName}.bw
fi
"""
}
process bigWig_to_bedgraph{
tag {bigWig.simpleName}
publishDir "${params.output}/${out_dir}/bedgraph", mode: 'copy', pattern: "*.bedgraph"
input:
set val(out_dir), file(bigWig) from big_wig_forward_to_convert_to_bedgraph.mix(big_wig_reverse_to_convert_to_bedgraph)
output:
file("*.bedgraph")
"""
bigWigToBedGraph ${bigWig} ${bigWig.baseName}.bedgraph
"""
}
// Generate one channel per postprocessing analysis
collected_wig_files.into{ collected_wig_2_to_RNA_subtypes_distribution; collected_wig_2_to_sequence_extraction; collected_wig_2_to_peak_distance; collect_wig_2_to_peak_height_histogram }
if (params.omit_peak_calling == false){
process index_for_peak_calling {
tag{query.simpleName}
input:
file(query) from bam_merge_to_pureCLIP
output:
set file("${query.simpleName}.sorted.bam"), file("${query.simpleName}.sorted.bam.bai") into bambai_index_to_peak_calling
"""
samtools sort ${query} > ${query.simpleName}.sorted.bam
samtools index ${query.simpleName}.sorted.bam
"""
}
process pureCLIP {
tag{bam.simpleName}
cache false
errorStrategy 'ignore' //TODO: this is meant to be only temporary. Need to find a solution for: ERROR: Emission probability became 0.0! This might be due to artifacts or outliers.
publishDir "${params.output}/peak_calling", mode: 'copy', pattern: "${bam.simpleName}.pureCLIP_crosslink_{sites,regions}.bed"
input:
set file(bam), file(bai), file(ref) from bambai_index_to_peak_calling.combine(reference_to_pureCLIP)
output:
file("${bam.simpleName}.pureCLIP_crosslink_{sites,regions}.bed")
file("${bam.simpleName}.pureCLIP_crosslink_sites.bed") into bed_peak_calling_to_sequence_extraction
file("${bam.simpleName}.pureCLIP_crosslink_sites.params") into params_peak_calling_to_collect_statistics
script:
if(params.peak_calling_for_high_coverage == true && params.peak_calling_regions == true)
"""
pureclip -i ${bam} -bai ${bai} -g ${ref} -nt ${task.cpus} -o ${bam.simpleName}.pureCLIP_crosslink_sites.bed -or ${bam.simpleName}.pureCLIP_crosslink_regions.bed -dm ${params.peak_calling_regions_width} -mtc 5000 -mtc2 5000 -ld
"""
else if(params.peak_calling_for_high_coverage == true && params.peak_calling_regions == false)
"""
pureclip -i ${bam} -bai ${bai} -g ${ref} -nt ${task.cpus} -o ${bam.simpleName}.pureCLIP_crosslink_sites.bed -mtc 5000 -mtc2 5000 -ld
"""
else if(params.peak_calling_for_high_coverage == false && params.peak_calling_regions == true)
"""
pureclip -i ${bam} -bai ${bai} -g ${ref} -nt ${task.cpus} -o ${bam.simpleName}.pureCLIP_crosslink_sites.bed -or ${bam.simpleName}.pureCLIP_crosslink_regions.bed -dm ${params.peak_calling_regions_width}
"""
else if(params.peak_calling_for_high_coverage == false && params.peak_calling_regions == false)
"""
ln -s ${ref} ${ref.baseName}_symlink.fasta
pureclip -i ${bam} -bai ${bai} -g ${ref.baseName}_symlink.fasta -nt ${task.cpus} -o ${bam.simpleName}.pureCLIP_crosslink_sites.bed
"""
}
}
process split_wig_2_for_peak_height_hist {
tag {query.simpleName}
input:
file(query) from collect_wig_2_to_peak_height_histogram
output:
set val("${query.simpleName}"), file("${query.simpleName}_forward.wig"), file("${query.simpleName}_reverse.wig") optional true into split_wig2_to_generate_peak_height_histogram
"""
wig2-to-wig.py --input ${query} --output ${query.simpleName}
"""
}
process generate_peak_height_histogram {
tag {query}
publishDir "${params.output}/peak_height_distribution", mode: 'copy'
input:
set val(query), file(forward), file(reverse) from split_wig2_to_generate_peak_height_histogram
output:
file("${query}.png")
"""
generate-peak-height-histogram.R --input . --output ${query} --type png --color "${params.color_barplot}" --percentile ${params.percentile}
"""
}
if ( params.annotation != 'NO_FILE'){
process wig_to_bam {
tag {query.simpleName}
input:
file(query) from collected_wig_2_to_RNA_subtypes_distribution
output:
file("${query.baseName}.bam") into bam_convert_to_feature_counts
"""
wig-to-bam.py --input ${query} --output ${query.baseName}.bam
"""
}
process feature_counts {
tag {query.simpleName}
input:
set file(query), val(rna_subtypes), file(annotation) from bam_convert_to_feature_counts.combine(rna_subtypes_to_feature_counts).combine(annotation_to_RNA_subtypes_distribution)
output:
file("${query.simpleName}.${rna_subtypes}.tsv") into tsv_feature_counts_to_sort
"""
featureCounts -T ${task.cpus} -t ${rna_subtypes} -g ${params.gene_id} -a ${annotation} -R CORE -M -o ${query.simpleName} ${query}
mv ${query}.featureCounts ${query.simpleName}.${rna_subtypes}.tsv
"""
}
tsv_feature_counts_to_sort
.map{file -> tuple(file.name - ~/\.[\w.]+.tsv$/,file)}
.groupTuple()
.set{tsv_sort_to_calculate_distribution}
process get_RNA_subtypes_distribution {
tag {name}
publishDir "${params.output}/RNA_subtypes", mode: 'copy'
input:
val(subtypes) from rna_subtypes_to_distribution.collect()
set val(name), file(query) from tsv_sort_to_calculate_distribution
output:
file("${name}.subtypes_distribution.tsv") into (output_rna_subtypes_tsv,tsv_rna_subtypes_distribution_to_barplot)
file("${name}.subtype.log") optional true into log_subtype_warnings
script:
subtypes_as_string = subtypes.join(' ')
"""
calc-RNA-subtypes-distribution.py --input ${query} --rna_subtypes ${subtypes_as_string} --output ${name}.subtypes_distribution.tsv > ${name}.subtype.log
"""
}
process generate_RNA_subtypes_barplot {
publishDir "${params.output}/RNA_subtypes", mode: 'copy'
input:
file(query) from tsv_rna_subtypes_distribution_to_barplot
output:
file("${query.baseName}.png") into output_rna_subtypes_png
"""
RNA_subtypes_barcharts.R --input ${query} --output ${query.baseName} --type png --color "${params.color_barplot}"
"""
}
process collect_subtype_analysis_errors {
publishDir "${params.output}/statistics", mode: 'copy', pattern: 'subtype-analysis-warnings.txt'
input:
file(query) from log_subtype_warnings.flatten().toList()
output:
file("subtype-analysis-warnings.txt") optional true into output_log_subtype_analysis_warnings
"""
for i in ${query}
do
if [[ ! \$(cat \$i | wc -l) == 0 ]]; then
echo "File: \$i" >> subtype-analysis-warnings.txt
cat \$i >> subtype-analysis-warnings.txt
fi
done
"""
}
}
if (params.omit_sequence_extraction == false) {
if(params.omit_peak_calling == false){
bed_peak_calling_to_sequence_extraction.set{peaks_for_sequence_extraction}
} else {
collected_wig_2_to_sequence_extraction.set{peaks_for_sequence_extraction}
}
process sequence_extraction {
tag {query.simpleName}
publishDir "${params.output}/extracted_sequences", mode: 'copy', pattern: "*.extracted-sequences.*"
input:
set file(query),file(reference) from peaks_for_sequence_extraction.combine(reference_to_extract_sequences)
output:
file "*.extracted-sequences.fasta" optional true into extracted_sequences
file "*.extracted-sequences.*" optional true into extracted_sequences_output
script:
if(params.omit_cl_nucleotide == true && params.omit_peak_calling == true)
"""
wig2-to-wig.py --input ${query} --output ${query.baseName}
sequence-extraction.py --input ${query.baseName}*.wig --reference ${reference} --output ${query.baseName}.extracted-sequences.fasta --length ${params.seq_len} --percentile ${params.percentile} --omit_cl
"""
else if(params.omit_cl_nucleotide == true && params.omit_peak_calling == false)
"""
sequence-extraction.py --input ${query} --reference ${reference} --output ${query.baseName}.extracted-sequences.fasta --length ${params.seq_len} --percentile ${params.percentile} --omit_cl
"""
else if(params.omit_cl_nucleotide == false && params.omit_peak_calling == true)
"""
wig2-to-wig.py --input ${query} --output ${query.baseName}
sequence-extraction.py --input ${query.baseName}*.wig --reference ${reference} --output ${query.baseName}.extracted-sequences.fasta --length ${params.seq_len} --percentile ${params.percentile}
"""
else if(params.omit_cl_nucleotide == false && params.omit_peak_calling == false)
"""