Example Data-parallel alignment - sge_novoalign.pl

#!/usr/bin/perl 

use strict;
use warnings;
use Getopt::Long;
use Cwd;
use File::Basename;

# My HPCC cluster requires jobs be staged to a dedicated Fiberchannel NFS
# to lessen strain on the standard NFS
my $DATA_HOME = "/datafc/vaughn";
my $BIN_HOME  = "/home/bin/novocraft";
my $UTIL_HOME = "/home/vaughn/bin";
my $WORK_HOME = Cwd::cwd();

my $query_file        = 'infile.fasta';
my $database_filepath = 'database.fasta';
my $out_file          = 'out_file.sam';
my $binary            = 'novoalign';
my $repeat            = 1;
my $nocleanup         = 0;

# Alignment params
my $mismatches_v = 0;
my $maxgap_g     = 0;

# How many jobs to submit by default
# 128 cores will align a standard Illumina lane in 10 min
my $jobs = 128;

GetOptions(
    "query1|query=s" => \$query_file,
    "database=s"     => \$database_filepath,
    "output=s"       => \$out_file,
    "jobs=i"         => \$jobs,
    "defaults"       => \$force_default,
    "app=s"          => \$binary,
    "repeat=i"       => \$repeat,
    "gap=i"          => \$maxgap_g,
    "mismatch=i"     => \$mismatches_v,
    "noclean"        => \$nocleanup
);

# Extract just the name of the subject file from its path
my $database_file = basename($database_filepath);

# Test for existence of aligner output in current directory
my $run_script = 1;
if ( -e "$out_file" ) {

    if (
        promptUserBoolean(
            "Alignment file $out_file is already present. Re-compute?", "N"
        )
      )
    {
        my $t = time();
        system("mv $out_file $out_file.$t");
        report("$out_file has been backed up as $out_file.$t");
        $run_script = 1;
    }
    else {
        $run_script = 0;
    }

}

if ($run_script) {

    # Make output directories on data volume

    my $OUTDIR = "$DATA_HOME/$binary." . time();
    report("\tStep 1: Initialize $OUTDIR");

    unless ( -e "$OUTDIR" ) {
        system("mkdir $OUTDIR");
        report("Created $OUTDIR");
    }
    foreach my $d qw(fasta stderr appout script) {
        unless ( -e "$OUTDIR/$d" ) {
            system("mkdir $OUTDIR/$d");
        }
    }

    report("\tStep 2: Copy database and prepare input files");

    # Most NGS aligners require that the subject sequence be pre-indexed.
    # Novoalign is no exception
    # I index locally then copy the indices to a central shared filesystem
    # This allows the indices to be re-used, an important point since
    # the process can be time-intensive.
    unless ( -e "$database_filepath.ndx" ) {
        report("$database_filepath.ndx was not found. Generating...");
        system("$BIN_HOME/novoindex $database_filepath.ndx $database_filepath");
    }
    system("cp $database_filepath.ndx $OUTDIR");

# This says splitfasta but it actually detects FASTA vs FASTQ and splits the file
# accordingly. So both filetypes are supported by this script.
    system("$UTIL_HOME/splitfasta.pl -f $query_file -o $OUTDIR/fasta -n $jobs");

    # Autocreate Sun Grid Engine script. Look Ma, I'm metaprogramming!
    report("\tStep 3: Build submit script");
    open( SHELL, ">$OUTDIR/run_align.sh" );
    print SHELL "#!/bin/sh\n";
    print SHELL "#\$ -l virtual_free=1.9G\n";
    print SHELL "#\$ -cwd\n";

    # The alignment command line is built specifically for the aligner.
    print SHELL
"$BIN_HOME/$binary -f fasta/query.\$SGE_TASK_ID -o SAM -r Random -s 9 -d $database_file.ndx > appout/$binary.\$SGE_TASK_ID\n";

    # Submission via qsub
    report("\tStep 4: Submitting to SGE");
    my $sync_str = "yes";

    # synchronous submission inside a perl shell thread
    system(
"cd $OUTDIR; qsub -sync $sync_str -t 1-$jobs:1 -N grid_$binary -S /bin/sh -cwd -o stderr/sge.out -e stderr/sge.err run_align.sh; cd $WORK_HOME"
    );

    report("\tStep 5: Compiling results into single file");
    if ( -e "$OUTDIR/appout/$binary.1" ) {
        system("cat $OUTDIR/appout/$binary.* >> $OUTDIR/$out_file");
    }
    else {
        report(
            "\tSomething is awry, as $OUTDIR/appout/$binary.* can't be located"
        );
        exit 1;
    }

    report("Step 6: Copying results to $WORK_HOME");
    if ( -e "$OUTDIR/$out_file" ) {
        system("mv $OUTDIR/$out_file $WORK_HOME/$out_file");
    }
    else {
        report("\tSomething is awry, as $OUTDIR/$out_file can't be located");
        exit 1;
    }

    report("\tStep 7: Clean up $OUTDIR");
    unless ($nocleanup) {
        foreach my $d qw(fasta stderr appout script) {
            system("rm -rf $OUTDIR/$d");
        }
    }
}

sub promptUserBoolean {

    my ( $promptString, $defaultValue ) = @_;

    if ($defaultValue) {
        print $promptString, " [", $defaultValue, "]: ";
    }
    else {
        print $promptString, ": ";
    }

    $| = 1;          # force a flush after our print
    $_ = <STDIN>;    # get the input from STDIN (presumably the keyboard)
    chomp;
    my $answer = $_ ? $_ : $defaultValue;

    if ( $answer =~ /True|T|Yes|Y/i ) {
        return 1;
    }
    else {
        return 0;
    }

}

sub report {

    my $text = shift;
    print STDERR $text, "\n";
    return 1;

}