#!/usr/bin/perl -w
use strict;


@ARGV or print "No input parameters, proceeding with default.\n";


# Input options are expected to be in the "-flag value" format 
# (for example: -o /tmp/ -i /tmp/  -n 12)
#
# Valid flags are:
# -i    directory holding the input file
# -o    directory where you want to output the results
# -n    minimum number of consecutive 'A' to be considered a polyA tail
# -k    maximum number of 'A' to keep in the polyA tail after trimming
#
# You can also change the name of the input 
# sequence and quality file in the variables below.
# The expected input file is a fasta file of 
# quality trimmed sequence, and its corresponding quality file.

my $in_fasta='seqs_fasta.trim';
my $out_fasta='seqs_fasta.polytrim';
my $in_qual='seqs_fasta.trim.qual';
my $out_qual='seqs_fasta.polytrim.qual';


my @arg_pairs = split (/\-/, (join ' ', @ARGV));

my %args=();

foreach (@arg_pairs){

    $_ or next;
    my ($flag, $val) = split /\s+/;
    $args{$flag} = $val;
}

my $out_dir = $args{'o'};
my $in_dir = $args{'i'};
my $nr_repeats = $args{'n'};
my $polyA_to_keep = $args{'k'};


#set defaults for io, script filename, etc
$out_dir ||= "/tmp/";
$in_dir ||= "/tmp/";
$nr_repeats ||= '12';
$polyA_to_keep ||= '20';



#uniform trailing slashes
$out_dir =~ /\/$/ or $out_dir .= '/';
$in_dir =~ /\/$/ or $in_dir .= '/';

my $seq_in_file = $in_dir . $in_fasta;
my $qual_in_file = $in_dir . $in_qual;
my $seq_out_file = $out_dir . $out_fasta;
my $qual_out_file = $out_dir . $out_qual;

my $script_file = __FILE__;


#main body of script
#####################

my %seq_hash = ();
my %qual_hash = ();

#get the sequences and quality and hash them

my ($defline, $dataline) = ('','');

open SEQ_FILEIN, $seq_in_file;
foreach (<SEQ_FILEIN>){
    chomp;
    if (/^>([^\s]+)\s/){ 
	if ($defline){
	    $seq_hash{$defline} = $dataline;
	}
	$defline = $1;
	$dataline = '';
	next;
    }
    s/^\s*([^\s]+)\s*$/$1/;
    $dataline .= $_;
}
$seq_hash{$defline} = $dataline;
close SEQ_FILEIN;


($defline,$dataline) = ('','');

open QUAL_FILEIN, $qual_in_file;
foreach (<QUAL_FILEIN>){
    chomp;
    if (/^>([^\s]+)\s/){ 
	if ($defline){
	    $qual_hash{$defline} = $dataline;
	}
	$defline = $1;
	$dataline = '';
	next;
    }
    $dataline .= $_;
}
$qual_hash{$defline} = $dataline;
close QUAL_FILEIN;


#open the output files and write to them as trimming progresses

open SEQ_FILEOUT, ">$seq_out_file";
open QUAL_FILEOUT, ">$qual_out_file";

foreach (keys %seq_hash){

    my $poly_trim = '';
    my $seq_string = $seq_hash{$_};
    my @seq = split(//,$seq_string);
    my @qual = split (/\s/, $qual_hash{$_});


# Find poly-A, ignoring other homopolymers since 
# quality trimming should have taken care of any issues with them

    my ($cut_start, $cut_ln);
    if($seq_string=~s/(A{$nr_repeats,})/\a$1\#/){

#if less polyA than min to keep, cut at end, else cut after to_keep As
	if ((length $1)<$polyA_to_keep){
	    $cut_start = (index $seq_string, "\#") - 1;
	}	    
	else {
	    $cut_start = (index $seq_string, "\a") + $polyA_to_keep;
	}

	$cut_ln = (length $seq_string)-$cut_start-1;
	splice @seq, $cut_start;
	splice @qual, $cut_start;
	$poly_trim = "- trimmed $cut_ln bp at poly-A tail";

    }

#print out the new values
    print SEQ_FILEOUT ">$_ $poly_trim \n" . join('', @seq) . "\n";
    print QUAL_FILEOUT ">$_ $poly_trim \n" . join(' ', @qual) . "\n"; 

}


close SEQ_FILEOUT;
close QUAL_FILEOUT;

