From affdf5d3f0497fd89569c43b5eee4ef9d1ad779f Mon Sep 17 00:00:00 2001
From: Torsten Seemann
Date: Thu, 12 Nov 2015 15:23:47 +1100
Subject: [PATCH] Add some helper scripts (only useful to MDU-PHL really)

---
 scripts/mdu-make_nullarbor_input.pl | 156 ++++++++++++++++++++++++++++
 scripts/xlsx2tsv.pl                 |  63 +++++++++++
 2 files changed, 219 insertions(+)
 create mode 100755 scripts/mdu-make_nullarbor_input.pl
 create mode 100755 scripts/xlsx2tsv.pl

diff --git a/scripts/mdu-make_nullarbor_input.pl b/scripts/mdu-make_nullarbor_input.pl
new file mode 100755
index 0000000..a0f21a5
--- /dev/null
+++ b/scripts/mdu-make_nullarbor_input.pl
@@ -0,0 +1,156 @@
+#!/usr/bin/env perl
+#
+# Build a nullarbor input table (label, read 1 path, read 2 path) by pulling
+# sample IDs out of a spreadsheet and finding the matching paired FASTQ files
+# underneath a reads directory.
+#
+use strict;
+use warnings;
+use Getopt::Long;
+use File::Spec;
+use Cwd qw(abs_path);
+use Spreadsheet::Read;
+use Data::Dumper;
+use Cwd;
+
+my $id_re = '\b(\d{4}-\d{5})\b';
+# the '.' before the extension is escaped so it only matches a literal dot
+my $read_re = '_R?([12])(?:_\d+)?\.f';
+my $in = '';
+my $dir = '/mnt/seq/MDU/READS';
+my $out = 'samples.tab';
+my $longid = 0;
+my $verbose = 0;
+
+# Print a one-line synopsis and exit.
+sub usage {
+  print "$0 [--verbose] [--longid] [--fastqdir $dir] [--out $out] --in jobdetails.xlsx\n";
+  exit;
+}
+
+@ARGV or usage();
+
+GetOptions(
+  "help" => \&usage,
+  "verbose!" => \$verbose,
+  "in=s" => \$in,
+  "fastqdir=s" => \$dir,
+  "out=s" => \$out,
+  "longid!" => \$longid,
+  "id_regexp=s" => \$id_re,
+  "read_regexp=s" => \$read_re,
+)
+or usage();
+
+if (!$in and @ARGV > 0) {
+  $in = shift @ARGV;
+  print STDERR "Guessing --in $in\n";
+}
+
+$in or die "need ID file with --in job.xls";
+-r $in or die "can't read ID file '$in'";
+
+$dir or die "need top-level folder containing FASTQ files with --fastqdir";
+-d $dir or die "--fastqdir '$dir' is not a directory";
+
+$out or die "need --out file to save results to";
+
+# compile regexps
+$id_re = qr"$id_re";
+$read_re = qr"$read_re";
+
+print STDERR "Scanning '$in' for MDU sample IDs...\n";
+
+my $book = ReadData( $in, cells=>0, strip=>3, attr=>1 )
+  or die "could not parse spreadsheet '$in'";
+#print Dumper($book); exit;
+my @row = Spreadsheet::Read::rows($book->[1]);
+
+# map each sample ID to a label built from the whole row it was found in
+my %id;
+for my $row (@row) {
+  my $line = join(' ', grep { defined $_ } @$row);
+  if ($line =~ $id_re) {
+    my $ID = $1;
+    $line =~ s/\s+/--/g;
+    $line =~ s/[_-]+$//;
+    $line =~ s/^[_-]+//;
+    $id{$ID} = $line;
+  }
+}
+
+printf STDERR "Found %d sample IDs:\n", scalar(keys %id);
+
+if (0 == keys %id) {
+  print STDERR "ERROR: no IDs found in '$in'\n";
+  exit -1;
+}
+
+print STDERR map { "$_\n" } sort keys %id if $verbose;
+
+print STDERR "Scanning '$dir' for read files...\n";
+
+my %want_id = (map { ($_ => 1) } keys %id);
+my %sample;
+
+# list-form pipe open: no shell is involved, so $dir needs no quoting
+open my $find_fh, '-|', 'find', $dir, '-type', 'f', '-name', '*.f*q.gz'
+  or die "could not run find on '$dir': $!";
+while (my $file = <$find_fh>) {
+  chomp $file;
+  if ($file =~ $id_re and exists $want_id{$1}) {
+    my $id = $1;
+    my(undef, undef, $name) = File::Spec->splitpath($file);
+    if ($name =~ $read_re) {
+      my $read = $1;
+#      print STDERR "$id $read\n";
+      $sample{$id}{$read} = abs_path($file);
+      print STDERR "Found $id $read : $file\n" if $verbose;
+    }
+    else {
+      print STDERR "WARNING: found $id but not $read_re in $name\n";
+    }
+  }
+}
+close $find_fh or print STDERR "WARNING: find did not exit cleanly\n";
+
+print STDERR "Creating output file: $out\n";
+open my $out_fh, '>', $out or die "could not write to '$out': $!";
+
+#use Data::Dumper;
+#print STDERR Dumper(\%id);
+
+for my $id (sort keys %id) {
+  if (exists $sample{$id}{1} and exists $sample{$id}{2}) {
+    my $label = $longid ? $id{$id} : $id;
+    print STDERR "$id - both reads found, labelling as '$label'\n";
+    # plain print: the paths are data and must not be used as a printf format
+    print {$out_fh} join("\t", $label, $sample{$id}{1}, $sample{$id}{2}), "\n";
+  }
+  elsif (!exists $sample{$id}{1} and !exists $sample{$id}{2}) {
+    print STDERR "$id - NO READ FOUNDS !!!\n";
+  }
+  elsif (!exists $sample{$id}{1}) {
+    print STDERR "$id - MISSING Read 1 FILE !!!\n";
+  }
+  elsif (!exists $sample{$id}{2}) {
+    print STDERR "$id - MISSING Read 2 FILE !!!\n";
+  }
+  else {
+    print STDERR "$id - THIS LINE SHOULD NEVER BE REACHED\n";
+  }
+}
+
+close $out_fh or die "could not finish writing '$out': $!";
+
+print STDERR "Result in '$out'\n";
+print STDERR "Done.\n";
+
+my $name = qx(basename `pwd`);
+chomp $name;
+# point the suggested command at the file actually written (--out may differ)
+my $cmd = "nullarbor.pl --name $name --outdir nullarbor --input $out --cpus 4 --mlst FIXME --ref FIXME";
+
+print STDERR "Your next command is probably this:\n$cmd\n";
+
+
diff --git a/scripts/xlsx2tsv.pl b/scripts/xlsx2tsv.pl
new file mode 100755
index 0000000..a46d759
--- /dev/null
+++ b/scripts/xlsx2tsv.pl
@@ -0,0 +1,63 @@
+#!/usr/bin/env perl
+#
+# Dump sheet 1 of a spreadsheet as separator-delimited text on STDOUT.
+#
+use warnings;
+use strict;
+use Spreadsheet::Read;
+use Data::Dumper;
+
+my(@Options, $verbose, $sep, $informat);
+setOptions();
+
+# the input file is mandatory; without it ReadData has nothing to parse
+@ARGV or usage();
+
+my $book = ReadData( $ARGV[0], cells=>0, strip=>1 )
+  or die "could not parse spreadsheet '$ARGV[0]'\n";
+
+my @row = Spreadsheet::Read::rows($book->[1]);
+
+for my $row (@row) {
+  # only undefined cells become empty; a literal 0 must survive
+  my @r = map { defined $_ ? $_ : '' } @$row;
+  print join($sep, @r),"\n";
+}
+
+#----------------------------------------------------------------------
+# Option setting routines
+
+# Parse the command line into the option variables and apply defaults.
+sub setOptions {
+  use Getopt::Long;
+
+  @Options = (
+    {OPT=>"help", VAR=>\&usage, DESC=>"This help"},
+    {OPT=>"verbose!", VAR=>\$verbose, DEFAULT=>0, DESC=>"Verbose output"},
+#    {OPT=>"informat=s", VAR=>\$informat, DEFAULT=>"xlsx", DESC=>"Input format: xls xlsx csv ods sxc"},
+    {OPT=>"sep=s", VAR=>\$sep, DEFAULT=>"\t", DESC=>"Output separator"},
+  );
+
+  #(!@ARGV) && (usage());
+
+  GetOptions(map {$_->{OPT}, $_->{VAR}} @Options) || usage();
+
+  # Now setup default values.
+  foreach (@Options) {
+    if (defined($_->{DEFAULT}) && !defined(${$_->{VAR}})) {
+      ${$_->{VAR}} = $_->{DEFAULT};
+    }
+  }
+}
+
+# Print the option table and exit with status 1.
+sub usage {
+  print "Usage: $0 [options] [<] file.xlsx > file.csv\n";
+  foreach (@Options) {
+    printf "  --%-13s %s%s.\n",$_->{OPT},$_->{DESC},
+      defined($_->{DEFAULT}) ? " (default '$_->{DEFAULT}')" : "";
+  }
+  exit(1);
+}
+
+#----------------------------------------------------------------------