-
Notifications
You must be signed in to change notification settings - Fork 37
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add some helper scripts (only useful to MDU-PHL really)
- Loading branch information
Showing
2 changed files
with
192 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
#!/usr/bin/env perl | ||
use strict; | ||
use warnings; | ||
use Getopt::Long; | ||
use File::Spec; | ||
use Cwd qw(abs_path); | ||
use Spreadsheet::Read; | ||
use Data::Dumper; | ||
use Cwd; | ||
|
||
# Default pattern for a sample ID such as 2015-12345; the ID itself is
# capture group 1. Can be replaced with --id_regexp.
my $id_re = '\b(\d{4}-\d{5})\b';

# Default pattern capturing the read number (1 or 2) from a FASTQ file name,
# e.g. _R1.fastq.gz, _2.fq.gz or _R1_001.fastq.gz. Can be replaced with
# --read_regexp. The dot before the extension is escaped so that it matches
# only a literal '.' rather than any character.
my $read_re = '_R?([12])(?:_\d+)?\.f';

my $in      = '';                    # --in: spreadsheet listing the samples
my $dir     = '/mnt/seq/MDU/READS';  # --fastqdir: folder searched for read files
my $out     = 'samples.tab';         # --out: tab-separated result file
my $longid  = 0;                     # --longid: label samples with the whole row text
my $verbose = 0;                     # --verbose: extra progress output on STDERR
|
||
# Print a one-line synopsis to STDOUT, showing the current --fastqdir and
# --out values, then terminate the script with the default exit status.
sub usage {
    my @synopsis = (
        $0,
        '[--verbose]',
        '[--longid]',
        "[--fastqdir $dir]",
        "[--out $out]",
        '--in jobdetails.xlsx',
    );
    print join(' ', @synopsis), "\n";
    exit;
}
|
||
# Show the synopsis when the script is invoked without any arguments.
@ARGV or usage();

# Parse the command line into the file-level settings. Every target is a
# scalar reference except --help, which calls usage() directly.
# --verbose must bind to the $verbose scalar: binding it to a code reference
# to a non-existent verbose() sub makes the flag die when used and leaves
# $verbose permanently 0.
GetOptions(
    "help"          => \&usage,
    "verbose!"      => \$verbose,
    "in=s"          => \$in,
    "fastqdir=s"    => \$dir,
    "out=s"         => \$out,
    "longid!"       => \$longid,
    "id_regexp=s"   => \$id_re,
    "read_regexp=s" => \$read_re,
) or usage();
|
||
# Accept the spreadsheet as a bare trailing argument when --in was not given.
if (!$in and @ARGV > 0) {
    $in = shift @ARGV;
    print STDERR "Guessing --in $in\n";
}

# The input spreadsheet must be named and readable.
$in or die "need ID file with --in job.xls";
-r $in or die "can't read ID file '$in'";

# The read folder is set with --fastqdir, so the messages name that option
# rather than a non-existent --dir.
$dir or die "need top-level folder containing FASTQ files with --fastqdir";
-d $dir or die "--fastqdir '$dir' is not a directory";

$out or die "need --out file to save results to";
|
||
# Turn both pattern strings into compiled regexp objects, in place, so the
# scanning loops below can reuse them.
$_ = qr/$_/ for ($id_re, $read_re);
|
||
print STDERR "Scanning '$in' for MDU sample IDs...\n";

# Load the spreadsheet. A false result is treated as a parse failure and
# reported here, rather than surfacing later as a misleading "no IDs found".
my $book = ReadData( $in, cells=>0, strip=>3, attr=>1 )
    or die "can't parse spreadsheet '$in'";
#print Dumper($book); exit;

# Only the first sheet is examined.
my @row = Spreadsheet::Read::rows($book->[1]);

# Maps each sample ID found to a label built from its whole row: the defined
# cells joined by spaces, whitespace runs turned into '--', and leading or
# trailing '_' / '-' characters trimmed.
my %id;
for my $row (@row) {
    my $line = join(' ', grep { defined $_ } @$row);
    if ($line =~ $id_re) {
        my $ID = $1;
        $line =~ s/\s+/--/g;
        $line =~ s/[_-]+$//;
        $line =~ s/^[_-]+//;
        # A later row with the same ID overwrites the earlier label.
        $id{$ID} = $line;
    }
}
|
||
# Report how many IDs were collected; stop with an error when there are none.
my $num_ids = keys %id;
printf STDERR "Found %d sample IDs:\n", $num_ids;

unless ($num_ids) {
    print STDERR "ERROR: no IDs found in '$in'\n";
    exit -1;
}

# In verbose mode list every ID, one per line, in sorted order.
if ($verbose) {
    print STDERR "$_\n" for sort keys %id;
}
|
||
print STDERR "Scanning '$dir' for read files...\n";

# Lookup table of the sample IDs found in the spreadsheet.
my %want_id = (map { ($_ => 1) } keys %id);

# $sample{ID}{read number} = absolute path of the matching read file.
my %sample;

# Run find through a list-form pipe open so that $dir never passes through
# a shell (a folder name with spaces or metacharacters is handled safely),
# and fail loudly if find cannot be started.
open my $find_fh, '-|', 'find', $dir, '-type', 'f', '-name', '*.f*q.gz'
    or die "can't run find on '$dir': $!";

while (my $file = <$find_fh>) {
    chomp $file;

    # Skip files whose path holds no wanted sample ID.
    next unless $file =~ $id_re;
    my $id = $1;
    next unless exists $want_id{$id};

    # The read number is taken from the file name only, not the folders.
    my(undef, undef, $name) = File::Spec->splitpath($file);
    if ($name =~ $read_re) {
        my $read = $1;
        # A later file with the same ID and read number replaces the earlier one.
        $sample{$id}{$read} = abs_path($file);
        print STDERR "Found $id $read : $file\n" if $verbose;
    }
    else {
        print STDERR "WARNING: found $id but not $read_re in $name\n";
    }
}

# A failing close means find exited abnormally (for example on an unreadable
# sub-folder), so warn but keep whatever was found.
close $find_fh
    or print STDERR "WARNING: find exited abnormally while scanning '$dir'\n";
|
||
print STDERR "Creating output file: $out\n";
open my $out_fh, '>', $out or die "can't write to '$out': $!";

# Write one tab-separated line (label, read 1 path, read 2 path) for every
# sample that has both reads; report incomplete samples on STDERR.
for my $id (sort keys %id) {
    my $reads  = $sample{$id} || {};
    my $has_r1 = exists $reads->{1};
    my $has_r2 = exists $reads->{2};

    if ($has_r1 and $has_r2) {
        # --longid uses the label built from the whole spreadsheet row.
        my $label = $longid ? $id{$id} : $id;
        print STDERR "$id - both reads found, labelling as '$label'\n";
        # Plain print, not printf: the joined text is data, so a '%' in a
        # label or path must not be interpreted as a format directive.
        print {$out_fh} join("\t", $label, $reads->{1}, $reads->{2}), "\n";
    }
    elsif (!$has_r1 and !$has_r2) {
        print STDERR "$id - NO READ FOUNDS !!!\n";
    }
    elsif (!$has_r1) {
        print STDERR "$id - MISSING Read 1 FILE !!!\n";
    }
    else {
        # Only remaining case: read 1 present, read 2 absent.
        print STDERR "$id - MISSING Read 2 FILE !!!\n";
    }
}

# Buffered write errors only surface at close, so check it.
close $out_fh or die "error writing '$out': $!";
|
||
print STDERR "Result in '$out'\n";
print STDERR "Done.\n";

# Name the suggested run after the current working directory. The name is
# taken with Cwd and File::Spec instead of shelling out to basename/pwd,
# which breaks on directory names containing spaces.
my $name = (File::Spec->splitdir(getcwd()))[-1];
$name = '' unless defined $name;

# Point --input at the file actually written ($out), not a fixed name, so
# the hint stays correct when --out is used.
my $cmd = "nullarbor.pl --name $name --outdir nullarbor --input $out --cpus 4 --mlst FIXME --ref FIXME";

print STDERR "Your next command is probably this:\n$cmd\n";
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
#!/usr/bin/env perl | ||
use warnings; | ||
use strict; | ||
use Spreadsheet::Read; | ||
use Data::Dumper; | ||
|
||
# File-level settings filled in by setOptions().
my(@Options, $verbose, $sep, $informat);
setOptions();

# A spreadsheet file name is required; show the help text when it is missing.
@ARGV or usage();

# Load the spreadsheet, stopping with a clear message if it cannot be read.
my $book = ReadData( $ARGV[0], cells=>0, strip=>1 )
    or die "can't read spreadsheet '$ARGV[0]'\n";

# Only the first sheet is converted.
my @row = Spreadsheet::Read::rows($book->[1]);

# Print every row as one line, with cells joined by the chosen separator.
for my $row (@row) {
    # Only undefined cells become empty strings; a plain truth test would
    # also blank out cells whose value is 0.
    my @r = map { defined $_ ? $_ : '' } @$row;
    print join($sep, @r),"\n";
}
|
||
#---------------------------------------------------------------------- | ||
# Option setting routines | ||
|
||
# Define the option table, parse the command line with it, then give every
# option that has a DEFAULT and is still undefined its default value.
# Prints the help text and exits (via usage) when parsing fails.
sub setOptions {
    use Getopt::Long;

    @Options = (
        { OPT => "help",     VAR => \&usage,                  DESC => "This help" },
        { OPT => "verbose!", VAR => \$verbose, DEFAULT => 0,  DESC => "Verbose output" },
        # { OPT => "informat=s", VAR => \$informat, DEFAULT => "xlsx", DESC => "Input format: xls xlsx csv ods sxc" },
        { OPT => "sep=s",    VAR => \$sep,     DEFAULT => "\t", DESC => "Output separator" },
    );

    # Flatten the table into the (spec, target) pairs GetOptions expects.
    my @spec;
    for my $opt (@Options) {
        push @spec, $opt->{OPT}, $opt->{VAR};
    }
    GetOptions(@spec) or usage();

    # Apply defaults. Entries without a DEFAULT (such as the help callback)
    # are skipped before their VAR is ever dereferenced.
    for my $opt (@Options) {
        next unless defined $opt->{DEFAULT};
        my $target = $opt->{VAR};
        $$target = $opt->{DEFAULT} unless defined $$target;
    }
}
|
||
# Print the synopsis followed by one line per entry of the option table
# (option spec, description, and default when one is defined), then exit
# with status 1.
sub usage {
    print "Usage: $0 [options] [<] file.xlsx > file.csv\n";
    for my $opt (@Options) {
        my $default_note = '';
        if (defined $opt->{DEFAULT}) {
            $default_note = " (default '$opt->{DEFAULT}')";
        }
        printf " --%-13s %s%s.\n", $opt->{OPT}, $opt->{DESC}, $default_note;
    }
    exit(1);
}
|
||
#---------------------------------------------------------------------- |