forked from LangilleLab/microbiome_helper
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrun_kraken.pl
executable file
·366 lines (258 loc) · 10.8 KB
/
run_kraken.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
#!/usr/bin/perl
use warnings;
use strict;
use File::Basename;
use Getopt::Long;
use Pod::Usage;
my $help;
my $version = "0.1";
my $version_marker;
my $parallel = 1;
my $db ;
my $out_dir = "kraken_output";
my $log = "kraken_log.txt";
my $report = "kraken_mpa_report.txt";
my $keep;
### delimiter used to identify sample name (either "_" or ".")
my $delimiter = "_";
### kraken parameters:
my $preload;
my $quick;
my $min_hits;
my $paired;
my $check_names;
### initialized variables not user-specified
my %s2f = (); ### samples as keys and fastqs as values (2 for PE and 1 for SE)
my @log_files = (); ### array to contain all log files, which will be looped over at the end
my @out_files = (); ### out files needed for kraken-mpa-report
my $kraken_options = ""; ### options to be passed to command-line
my $res = GetOptions("database|db=s"=>\$db,
"out_dir|o=s" => \$out_dir,
"log=s"=>\$log,
"thread:i"=>\$parallel,
"preload"=>\$preload,
"quick"=>\$quick,
"min-hits:i"=>\$min_hits,
"paired"=>\$paired,
"check-names"=>\$check_names,
"keep"=>\$keep,
"report=s"=>\$report,
"delimiter=s"=>\$delimiter,
"help|h"=>\$help,
"version|v"=>\$version_marker,
) or pod2usage(2);
pod2usage(-verbose=>2) if $help;
if ( $version_marker ) { print "version $version\n"; exit }
### the "2>&1" syntax means output STDERR to same output as STDOUT (i.e. same output as file descripter 1).
if ( index( `kraken 2>&1` , "Need to specify input filenames!" ) == -1 ) { die "Stopping job, because \"kraken\" is not in your path.\n"; }
if ( ! defined $db ) { die "need database directory\n"; }
if ( ( $delimiter ne "_" ) and ( $delimiter ne "." ) ) { die "delimiter $delimiter needs to be \"_\" or \".\"\n"; }
my @files = @ARGV;
### link sample names to fastqs
&fastqs2sample( @files );
pod2usage($0.': You must provide a list of fastq files.') unless @files;
# check that all files exist:
foreach my $f ( @files ) {
if ( ! -e $f ) { die "file $f doesn't exist\n"; }
}
system( "mkdir -p $out_dir");
if ( $preload ) { $kraken_options = $kraken_options . " --preload"; }
if ( $quick ) { $kraken_options = $kraken_options . " --quick"; }
if ( defined $min_hits ) { $kraken_options = $kraken_options . " --min-hits $min_hits"; }
if ( $paired ) { $kraken_options = $kraken_options . " --paired"; }
if ( $check_names ) { $kraken_options = $kraken_options . " --check-names"; }
foreach my $sample ( keys %s2f ) {
my @fastqs = @{ $s2f{$sample} };
my $out = "$out_dir" . "/" . $sample . "_out.txt";
my $log = "$out_dir" . "/" . $sample . "_out.log";
push( @out_files , $out );
push( @log_files , $log );
print STDERR "\nRunning:\nkraken --db $db $kraken_options --output $out @fastqs 2> $log\n";
system( "kraken --db $db $kraken_options --output $out @fastqs 2> $log" );
}
### now combine all kraken output files into mpa report:
print STDERR "\nRunning:\nkraken-mpa-report --db $db @out_files > $report\n";
system( "kraken-mpa-report --header-line --db $db @out_files > $report" );
my $report_spf = $report;
$report_spf =~ s/\.txt$/.spf/;
print STDERR "\nRunning:\nmetaphlan_to_stamp.pl $report > $report_spf\n";
system( "metaphlan_to_stamp.pl $report > $report_spf");
my $stamp = $report_spf;
$stamp =~ s/\.spf$/_rel-ab.spf/;
open( 'SPF' , '>' , $stamp ) or die "cant create new spf file $stamp\n";
my %counts = (); ### hash with samples as keys and total read matches as values
my @samples = ();
### read in file once to get sums:
my $lc = 0; # line count
open( 'REPORT' , '<' , $report_spf) or die "cant open kraken-mpa-report output in spf format $report_spf\n";
while( <REPORT> ) {
my @s = split( ' ' , $_ );
if ( $lc == 0 ) {
my @header = (splice( @s , 0 , 7 ) );
my $i = 0;
foreach my $s ( @s ) {
my ($name,$path,$suffix) = fileparse( $s , ("_out.txt") );
push( @header , $name );
$counts{$name} = 0;
push( @samples , $name );
++$i;
}
my $header = join(" " , @header );
print SPF "$header";
++$lc;
next;
}
splice( @s , 0 , 7 );
my $j = 0;
foreach my $s ( @s ) {
my $sample = $samples[$j];
$counts{$sample} += $s;
++$j;
}
} close( 'REPORT' );
$lc = 0; # line count
open( 'REPORT' , '<' , $report_spf ) or die "cant open kraken-mpa-report output in spf format $report_spf\n";
while( <REPORT> ) {
my @s = split( ' ' , $_ );
if ( $lc == 0 ) {
++$lc;
next;
}
my @line = splice( @s , 0 , 7 );
my $j = 0;
foreach my $s ( @s ) {
my $sample = $samples[$j];
push( @line , sprintf( "%.5f" , ($s/$counts{$sample})*100) );
++$j;
}
my $line = join( " " , @line );
print SPF "$line\n";
} close( 'REPORT' );
### create single log file for all samples by looping through individual log files
open( 'MASTERLOG' , '>' , $log ) or die "cant create MASTERLOG file $log\n";
print MASTERLOG "sample total_reads classified unclassified classified_percent unclassified_percent\n";
my $i = 0;
foreach my $err ( @log_files ) {
### read in log file for kraken jobs:
open( 'LOG' , '<' , $err ) or die "cant open LOG $err\n";
chomp( my @lines = <LOG> );
close( 'LOG' );
my ($name,$path,$suffix) = fileparse( $err , (".fq" , ".fastq") );
### scan 2nd last line of file for info on classified sequences:
$lines[$#lines-1] =~ m/(\d+) sequences classified \((\S+)%\)/;
my $classified = $1;
my $classified_percent = $2;
### scan last line of file for info on unclassified sequences:
$lines[$#lines] =~ m/(\d+) sequences unclassified \((\S+)%\)/;
my $unclassified = $1;
my $unclassified_percent = $2;
my $total = $classified + $unclassified;
print MASTERLOG "$name $total $classified $unclassified $classified_percent $unclassified_percent\n";
### remove individual log files for each vsearch job:
if ( ! $keep ) { system( "rm $err" ); }
++$i;
}
close( 'MASTERLOG' );
sub fastqs2sample {
foreach my $f ( @_ ) {
my @f_split = ();
if ( $delimiter eq "_" ) {
@f_split = split( '_' , basename($f) );
} elsif ( $delimiter eq "." ) {
@f_split = split( '\.' , basename($f) );
}
my $s = $f_split[0]; #sample
my $base = basename($f);
if ( ! exists $s2f{$s} ) {
my @dummy = ();
$s2f{$s} = \@dummy;
}
push( @{$s2f{$s}} , $f );
}
### keep track of #s of SE and PE samples, shouldn't be both types of samples
my $se_marker = 0;
my $pe_marker = 0;
### check that if there are 2 fastqs per sample that they are R1 and R2. Also, no more than 2 fastqs per sample.
foreach my $s ( keys %s2f ) {
my @f = @{$s2f{$s}};
my $num_fastq = scalar @f;
if ( $num_fastq == 1 ) {
++$se_marker;
### only 1 fastq for this sample, assuming it's SE and moving on.
} elsif ( $num_fastq == 2 ) {
++$pe_marker;
### assuming this sample has 2 fastqs because they are PE, check that either "_R1." or "_R1_" or "forward" is in one filename and "_R2." or "_R2_" or "reverse" is in the other's filename
if ( ( basename($f[0]) =~ m/_R1\.|_R1_|forward/ ) and ( basename($f[1]) =~ m/_R2\.|_R2_|reverse/ ) ) {
### the filenames seem to be PE reads and already ordered as forward and reverse, so continue
} elsif ( ( basename($f[1]) =~ m/_R1\.|_R1_|forward/ ) and ( basename($f[0]) =~ m/_R2\.|_R2_|reverse/ ) ) {
### filenames are PE, but reorder so they are forward and reverse
my @tmp = ($f[1] , $f[0] );
@{$s2f{$s}} = \@tmp;
} else {
die "the 2 fastqs for sample $s don't have PE read ids in their names, are you sure these are PE reads? They need _R1. and _R2. (or either _R1_ and _R2_ or forward and reverse) in their names. The filenames are: @f\n";
}
} elsif ( ($num_fastq < 1 ) or ( $num_fastq > 2 ) ) {
die "$num_fastq fastqs for sample $s. This script assumed that fastq prefixes (when delimited by \"$delimiter\") are sample IDs, check your filenames\n";
}
}
if ( ( $se_marker > 0 ) and ( $pe_marker > 0 ) ) {
die "Stopping job - $se_marker SE sample(s) and $pe_marker PE sample(s) input. You should run SE and PE samples through this script separately\n";
} elsif ( ( $se_marker > 0 ) and ( $paired ) ) {
die "Stopping job - $se_marker SE sample(s), but '--paired' option used. You should run SE and PE samples through this script separately\n";
} elsif ( ( $pe_marker > 0 ) and ( ! $paired ) ) {
die "Stopping job - $pe_marker PE sample(s), but '--paired' option was not used.\n";
}
}
=head1 Name
run_kraken.pl - wrapper to run kraken to do taxonomic classification of metagenomic reads along with the post-processing tools kraken-translate and kraken-mpa-report.
=head1 USAGE
run_kraken.pl [-log <logfile> -thread <#_CPU_to_use> -o <out_dir> -h -v --keep --preload --quick --min-hits <int> --report <outfile> --paired --delimiter <.|_>] --db <FASTA> <list of FASTA files>
NOTE: the "kraken" and "kraken-mpa-report" binaries need to be in your PATH.
Four output types will be created:
- a folder containing the output of each individual kraken job (in "kraken_output" by default)
- the output of kraken-mpa-report ("kraken_mpa_report.txt" by default)
- this report converted to STAMP format ("kraken_mpa_report.spf" by default)
- the above STAMP file converted into the relative abundance per sample ("kraken_mpa_report_rel-ab.spf" by default, which is what you would read into STAMP).
kraken website:
https://ccb.jhu.edu/software/kraken/
kraken paper:
Derrick E Wood and Steven L Salzberg
Kraken: ultrafast metagenomic sequence classification using exact alignments
Genome Biology 2014 15:R46
DOI: 10.1186/gb-2014-15-3-r46
=head1 OPTIONS
=over 4
=item B<-h, --help>
Displays the entire help documentation.
=item B<-v, --version>
Displays version number and exits.
=item B<-o, --out_dir <file>>
Output directory for kraken output (default is "kraken_output").
=item B<-thread <# of CPUs>>
Numbers of threads to use (1 by default).
=item B<-keep>
Flag to indicate that temporary log files should not be removed (useful for troubleshooting).
=item B<-log <file>>
the name of the log file - kraken_log.txt by default.
=item B<-db, --database <file>>
Database of genomes to use as a reference (FASTA file).
=item B<--preload >
kraken flag to preload database into memory for faster computation (off by default).
=item B<--quick >
kraken flag to indicate quick operation (off by default).
=item B<--min-hits <int> >
kraken parameter for number of hits required for classification.
=item B<--paired >
kraken parameter to indicate fastqs are PE.
=item B<--check-names >
kraken parameter to check that each pair of reads have names that agree with each other. Ignored if --paired is not specified.
=item B<--keep >
Flag to keep temporary logfiles rather than to delete them (useful for troubleshooting).
=item B<--report <FILE> >
kraken-mpa-format output file ("kraken_mpa_report.txt" by default)
=item B<-d,--delimiter <.|_>>
Either "." or "_" to use as a delimiter for filenames. Sample IDs are taken to be the first field of each filename. Default: "_".
=back
=head1 AUTHOR
Gavin Douglas <[email protected]>
=cut