#!/usr/bin/perl -w
use strict;


@ARGV or print "No input parameters, proceeding with default.\n";

my @arg_pairs = split (/\-/, (join ' ', @ARGV));

my %args=();

foreach (@arg_pairs){

    $_ or next;
    my ($flag, $val)=split /\s+/;
    $args{$flag}=$val;
}

my $out_dir=$args{'o'};
my $in_dir=$args{'i'};
my $adaptor_seq=$args{'a'};
my $treshold_qual=$args{'q'};
my $verbose=$args{'v'};


#set defaults for io, script filename, etc
$out_dir ||= "/tmp/";
$in_dir ||= "/tmp/";
$treshold_qual ||= '15';



$adaptor_seq or die "Please specify and adaptor using -a\n";

#uniform trailing slashes
$out_dir=~/\/$/ or $out_dir.='/';
$in_dir=~/\/$/ or $in_dir.='/';

my $seq_in_file=$in_dir."seqs_fasta.screen";
my $qual_in_file=$in_dir."seqs_fasta.screen.qual";
my $seq_out_file=$out_dir."seqs_fasta.trim";
my $qual_out_file=$out_dir."seqs_fasta.trim.qual";

my $script_file=__FILE__;


#main body of script
#####################

my $start_time=time;

my %seq_hash=();
my %qual_hash=();

#get the sequences and quality and hash them

$verbose and print "Parsing sequence files\n";

my ($defline, $dataline);

open SEQ_FILEIN, $seq_in_file;
foreach (<SEQ_FILEIN>){
    chomp;
    if (/^>([^\s]+)\s/){ 
	if ($defline){
	    $seq_hash{$defline}=$dataline;
	}
	$defline=$1;
	$dataline='';
	next;
    }

    s/^\s*([^\s]+)\s*$/$1/;
    $dataline.=$_;
}
$seq_hash{$defline}=$dataline;
close SEQ_FILEIN;

($defline,$dataline)=('','');

open QUAL_FILEIN, $qual_in_file;

foreach (<QUAL_FILEIN>){
    chomp;
    if (/^>([^\s]+)\s/){ 

	if ($defline){
	    $qual_hash{$defline}=$dataline;
	}
	$defline=$1;
	$dataline='';
	next;
    }

    $dataline.=$_;
}
$qual_hash{$defline}=$dataline;
close QUAL_FILEIN;

if($verbose){
   print "Starting trimming.\n";
}


open SEQ_FILEOUT, ">$seq_out_file";
open QUAL_FILEOUT, ">$qual_out_file";
    
my @adaptor=split(//,$adaptor_seq);
my @adaptor_variations;


#generate the possible adaptor variations
#we allow for 1 mismatch
my $i;
for ($i=0;$i<@adaptor;$i++){
    my @variation=@adaptor;
    $variation[$i]='[ACGTN]';
    my $current_variation = join ('', @variation);
    push @adaptor_variations, $current_variation;
}


my $seq_id;
foreach $seq_id (keys %seq_hash){

    my @qual=split(/ /, $qual_hash{$seq_id});


    my (@seq_pieces, @qual_pieces);
    @seq_pieces=split(/X+/, $seq_hash{$seq_id});

    my ($best_seq_piece, $best_qual_piece, $best_adaptor)=('','','');


    my (@adaptor_seqs, @trimmed_seq_pieces, @trimmed_adaptor_seq_pieces, @trimmed_qual_pieces, @trimmed_adaptor_qual_pieces);

    foreach (@seq_pieces){

	$_ or next;

	my $start= index ($seq_hash{$seq_id}, $_);
	my $end= $start + (length $_) - 1;
	my @seq_piece=split(//,$_);
	my @qual_piece=@qual[$start..$end];


#look for adaptor seq within 2*length of adaptor from start of fragment
	my $adaptor_window_ln = 2 * length $adaptor_seq;
	my $window_start=0;
	($adaptor_window_ln > length) and $adaptor_window_ln = length ;
	my $window_end=$adaptor_window_ln-1;
	
	my $search_target=join ('', @seq_piece[$window_start..$window_end]);

	my ($end_adaptor_position,$start_adaptor_position);
	my $search_pattern;
	foreach $search_pattern (@adaptor_variations){
	    if($search_target=~s/($search_pattern)/\%$1\#/){
		$start_adaptor_position=index $search_target, '%';
		$end_adaptor_position=(index $search_target, '#')-2;
		push @adaptor_seqs,$1;
		last; 
	    }
	}

	if($end_adaptor_position){
	    splice @seq_piece,0,$end_adaptor_position+1;
	    splice @qual_piece,0,$end_adaptor_position+1;
	}

#find the best quality region given the treshold
	my ($i, $j, $i_best, $j_best, $sum, $sum_max)= (0,0,0,0,0,0);

	for ($j=0; $j<@qual_piece; $j++){
	    
#raise the zero floor to the treshold
	    my $adjusted_qual=$qual_piece[$j]-$treshold_qual;
	    $sum+=$adjusted_qual;
	    
	    if($sum > $sum_max){
		$sum_max=$sum;
		$i_best=$i;
		$j_best=$j;
	    }
	    if($sum < 0){
		$sum=0;
		$i=$j+1;
	    }
	    
	}
	
	my ($best_seq, $best_qual)=('','');
	
	if ($sum_max){
	    $best_seq=join('', @seq_piece[$i_best..$j_best]);
	    $best_qual=join(' ', @qual_piece[$i_best..$j_best]);
	}

	if ($end_adaptor_position){
	    push @trimmed_adaptor_seq_pieces, $best_seq;
	    push @trimmed_adaptor_qual_pieces, $best_qual;
	}
	else{
	    push @trimmed_seq_pieces, $best_seq;
	    push @trimmed_qual_pieces, $best_qual;
	}
    }

#select best piece from ones with adaptor if found, from all pieces if not
    if(@trimmed_adaptor_seq_pieces){
	my $adaptor_index=0;
	while($adaptor_index<@trimmed_adaptor_seq_pieces){
	    if((length $best_seq_piece) < (length $trimmed_adaptor_seq_pieces[$adaptor_index])){
		$best_seq_piece=$trimmed_adaptor_seq_pieces[$adaptor_index];
		$best_qual_piece=$trimmed_adaptor_qual_pieces[$adaptor_index];
		$best_adaptor=$adaptor_seqs[$adaptor_index];
	    }
	    $adaptor_index++;
	}
    }
    else{
	my $piece_index=0;
	while($piece_index<@trimmed_seq_pieces){
	    if((length $best_seq_piece) < (length $trimmed_seq_pieces[$piece_index])){
		$best_seq_piece=$trimmed_seq_pieces[$piece_index];
		$best_qual_piece=$trimmed_qual_pieces[$piece_index];
	    }
	    $piece_index++;
	}
    }


#print out the new values
    print SEQ_FILEOUT ">$seq_id - low quality trimmed \n$best_seq_piece\n";
    print QUAL_FILEOUT ">$seq_id - low quality trimmed \n$best_qual_piece\n"; 

}



close SEQ_FILEOUT;
close QUAL_FILEOUT;



