#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long;
use Cwd;
use File::Basename;
# My HPCC cluster requires jobs be staged to a dedicated Fiberchannel NFS
# to lessen strain on the standard NFS
my $DATA_HOME = "/datafc/vaughn";
my $BIN_HOME = "/home/bin/novocraft";
my $UTIL_HOME = "/home/vaughn/bin";
my $WORK_HOME = Cwd::cwd();
my $query_file = 'infile.fasta';
my $database_filepath = 'database.fasta';
my $out_file = 'out_file.sam';
my $binary = 'novoalign';
my $repeat = 1;
my $nocleanup = 0;
# Alignment params
my $mismatches_v = 0;
my $maxgap_g = 0;
# How many jobs to submit by default
# 128 cores will align a standard Illumina lane in 10 min
my $jobs = 128;
GetOptions(
"query1|query=s" => \$query_file,
"database=s" => \$database_filepath,
"output=s" => \$out_file,
"jobs=i" => \$jobs,
"defaults" => \$force_default,
"app=s" => \$binary,
"repeat=i" => \$repeat,
"gap=i" => \$maxgap_g,
"mismatch=i" => \$mismatches_v,
"noclean" => \$nocleanup
);
# Extract just the name of the subject file from its path
my $database_file = basename($database_filepath);
# Test for existence of aligner output in current directory
my $run_script = 1;
if ( -e "$out_file" ) {
if (
promptUserBoolean(
"Alignment file $out_file is already present. Re-compute?", "N"
)
)
{
my $t = time();
system("mv $out_file $out_file.$t");
report("$out_file has been backed up as $out_file.$t");
$run_script = 1;
}
else {
$run_script = 0;
}
}
if ($run_script) {
# Make output directories on data volume
my $OUTDIR = "$DATA_HOME/$binary." . time();
report("\tStep 1: Initialize $OUTDIR");
unless ( -e "$OUTDIR" ) {
system("mkdir $OUTDIR");
report("Created $OUTDIR");
}
foreach my $d qw(fasta stderr appout script) {
unless ( -e "$OUTDIR/$d" ) {
system("mkdir $OUTDIR/$d");
}
}
report("\tStep 2: Copy database and prepare input files");
# Most NGS aligners require that the subject sequence be pre-indexed.
# Novoalign is no exception
# I index locally then copy the indices to a central shared filesystem
# This allows the indices to be re-used, an important point since
# the process can be time-intensive.
unless ( -e "$database_filepath.ndx" ) {
report("$database_filepath.ndx was not found. Generating...");
system("$BIN_HOME/novoindex $database_filepath.ndx $database_filepath");
}
system("cp $database_filepath.ndx $OUTDIR");
# This says splitfasta but it actually detects FASTA vs FASTQ and splits the file
# accordingly. So both filetypes are supported by this script.
system("$UTIL_HOME/splitfasta.pl -f $query_file -o $OUTDIR/fasta -n $jobs");
# Autocreate Sun Grid Engine script. Look Ma, I'm metaprogramming!
report("\tStep 3: Build submit script");
open( SHELL, ">$OUTDIR/run_align.sh" );
print SHELL "#!/bin/sh\n";
print SHELL "#\$ -l virtual_free=1.9G\n";
print SHELL "#\$ -cwd\n";
# The alignment command line is built specifically for the aligner.
print SHELL
"$BIN_HOME/$binary -f fasta/query.\$SGE_TASK_ID -o SAM -r Random -s 9 -d $database_file.ndx > appout/$binary.\$SGE_TASK_ID\n";
# Submission via qsub
report("\tStep 4: Submitting to SGE");
my $sync_str = "yes";
# synchronous submission inside a perl shell thread
system(
"cd $OUTDIR; qsub -sync $sync_str -t 1-$jobs:1 -N grid_$binary -S /bin/sh -cwd -o stderr/sge.out -e stderr/sge.err run_align.sh; cd $WORK_HOME"
);
report("\tStep 5: Compiling results into single file");
if ( -e "$OUTDIR/appout/$binary.1" ) {
system("cat $OUTDIR/appout/$binary.* >> $OUTDIR/$out_file");
}
else {
report(
"\tSomething is awry, as $OUTDIR/appout/$binary.* can't be located"
);
exit 1;
}
report("Step 6: Copying results to $WORK_HOME");
if ( -e "$OUTDIR/$out_file" ) {
system("mv $OUTDIR/$out_file $WORK_HOME/$out_file");
}
else {
report("\tSomething is awry, as $OUTDIR/$out_file can't be located");
exit 1;
}
report("\tStep 7: Clean up $OUTDIR");
unless ($nocleanup) {
foreach my $d qw(fasta stderr appout script) {
system("rm -rf $OUTDIR/$d");
}
}
}
sub promptUserBoolean {
my ( $promptString, $defaultValue ) = @_;
if ($defaultValue) {
print $promptString, " [", $defaultValue, "]: ";
}
else {
print $promptString, ": ";
}
$| = 1; # force a flush after our print
$_ = <STDIN>; # get the input from STDIN (presumably the keyboard)
chomp;
my $answer = $_ ? $_ : $defaultValue;
if ( $answer =~ /True|T|Yes|Y/i ) {
return 1;
}
else {
return 0;
}
}
sub report {
my $text = shift;
print STDERR $text, "\n";
return 1;
}