#!/usr/bin/perl -w
use strict;

#person in charge of this script
#this should be the person currently in charge of scripts for the SGN project
my $script_maintainer='Dan Ilut <dci1@cornell.edu>';

use lib '/soldb/local_scripts/custom_packages';

#local packages to use
use runtime;
use db_link;
use projects;


@ARGV or print "No input parameters, proceeding with default.\n";

my @arg_pairs = split (/\-/, (join ' ', @ARGV));

my %args=();

foreach (@arg_pairs){

    $_ or next;
    my ($flag, $val)=split /\s+/;
    $args{$flag}=$val;
}

my $out_dir=$args{'o'};
my $in_dir=$args{'i'};
my $project=$args{'p'};
my $treshold_qual=$args{'q'};

$project or die  "Please specify a project (cgn, fgn, pgn) using the -p flag\n";

my ($db, $usr) = @{projects::get_db_info($project)};
$db or die "No known database for project $project";

#set defaults for io, script filename, etc
$out_dir ||= "/tmp/";
$in_dir ||= "/tmp/";
$treshold_qual ||= '15';


#uniform trailing slashes
$out_dir=~/\/$/ or $out_dir.='/';
$in_dir=~/\/$/ or $in_dir.='/';

my $seq_in_file=$in_dir."seqs_fasta.screen";
my $qual_in_file=$in_dir."seqs_fasta.screen.qual";
my $seq_out_file=$out_dir."seqs_fasta.trimmed";
my $qual_out_file=$out_dir."seqs_fasta.trimmed.qual";

my $script_file=__FILE__;


#main body of script
#####################

my $start_time=time;

my %seq_hash=();
my %qual_hash=();

#get the sequences and quality and hash them

my ($defline, $dataline);

open SEQ_FILEIN, $seq_in_file;
foreach (<SEQ_FILEIN>){
    chomp;
    if (/^>([^\s]+)[\s\n]/){ 
	if ($defline){
	    $seq_hash{$defline}=$dataline;
	}
	$defline=$1;
	$dataline='';
	next;
    }

    s/^\s*([^\s]+)\s*$/$1/;
    $dataline.=$_;
}
$seq_hash{$defline}=$dataline;
close SEQ_FILEIN;

($defline,$dataline)=('','');

open QUAL_FILEIN, $qual_in_file;

foreach (<QUAL_FILEIN>){
    chomp;
    if (/^>([^\s]+)[\s\n]/){ 

	if ($defline){
	    $qual_hash{$defline}=$dataline;
	}
	$defline=$1;
	$dataline='';
	next;
    }

    $dataline.=$_;
}
$qual_hash{$defline}=$dataline;
close QUAL_FILEIN;


# try to open the database link
my $dbh = db_link::connect_db($db, $usr) or die "couldn't open database link\n";
my ($stm, $sth, $rv, $rc);


open SEQ_FILEOUT, ">$seq_out_file";
open QUAL_FILEOUT, ">$qual_out_file";

my ($seq_id, $seq_group_id, $front_vector_end, $tail_vector_start, $front_trim);

foreach $seq_id (keys %seq_hash){

######DEBUG######
    print "$seq_id:\n";


#initialize the vector trim index
    $front_vector_end=0;
    $tail_vector_start=(length $seq_hash{$seq_id})-1;
    $front_trim=0;

#get the adaptor sequence and seq_group id from the library information
my $adaptor_seq;

    $stm="select l.adaptor_sequence, r.seq_id from raw_sequence as r left join est_info as i on r.seq_id=i.seq_id left join est_library as l on i.est_library_id=l.est_library_id where raw_seq_id='$seq_id'";
    $sth = $dbh->prepare($stm) 
        || die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
        || die "Can't execute statement: $DBI::errstr";
    $rc = $sth->bind_columns(\$adaptor_seq, \$seq_group_id);
    $sth->fetch; 

my @adaptor=split(//,$adaptor_seq);
my @adaptor_variations;

#######DEBUG########
print "$adaptor_seq\n";

#generate the possible adaptor variations
#we allow for 2 mismatches or 1 deletion and one mismatch 
my ($i,$j);
for ($i=0;$i<@adaptor;$i++){
    my @variation=@adaptor;
    $variation[$i]='[ACGTN]?';
    for ($j=0;$j<@variation;$j++){
	$j==$i and next;
	my @var2=@variation;
	$var2[$j]='[ACGTN]';

	my $current_variation = join ('', @var2);

	push @adaptor_variations, $current_variation;
    }
}

#generate the sequence and quality arrays
    my @seq=split(//,$seq_hash{$seq_id});
    my @qual= split /\s/, $qual_hash{$seq_id};

    my $vector='no';
    my %nov_pieces=();
    my ($best_seq, $best_qual);


#trim any vector right at start or end
    while ($seq[0] eq 'X'){
	shift @seq;
	shift @qual;
	$front_vector_end++;
	$front_trim++;
    }
    while ($seq[-1] eq 'X'){
	pop @seq;
	pop @qual;
	$tail_vector_start--;
    }


    $best_seq=join ('', @seq);
    $best_qual=join(' ', @qual);
    
    
    if((join ('', @seq))=~/X/){
#find internal vectors

########DEBUG########
	print "Looking for internal vectors:\n";

#start with no-vector, since we trimmed the end vectors already
	push @{$nov_pieces{'start'}}, 0;
    

	for ($i=0; $i<@seq; $i++){
	    
	    if ($seq[$i] eq 'X'){
		if($vector eq 'no'){ 
		    push @{$nov_pieces{'stop'}}, $i-1;
		    $vector='yes';
		    next;
		}
	    }
	else{
	    if($vector eq 'yes'){
		push @{$nov_pieces{'start'}}, $i;
		$vector='no';
	    }
	}
	}
	
#end with no-vector, since we trimmed the end vectors already
	push @{$nov_pieces{'stop'}}, @seq-1;
	
	
	
	
	
#find the best stretch starting with adaptor seq 
#or with best avg quality 
#or with longest stretch 
	
	my $seq_start=0;
	my $seq_end=0;
	my ($best_seq_start, $best_seq_end);
	my @nov_adaptor;  #contains start and end of vector flanked region after adaptor excising
	
	for ($j=0; $j<@{$nov_pieces{'start'}}; $j++){
	    $seq_start=$nov_pieces{'start'}[$j];
	    $seq_end=$nov_pieces{'stop'}[$j];
	    
#look for adaptor seq within 2*length of adaptor from start of fragment
	    my $adaptor_window_ln = 2 * length $adaptor_seq;
	    my $window_start=$seq_start;
	    my $window_end=$seq_start+$adaptor_window_ln;
	    $window_end > $seq_end and $window_end=$seq_end;
	    
	    my $search_target=join ('', @seq[$window_start..$window_end]);
	    my ($end_adaptor_position,$start_adaptor_position);
	    
	    my $search_pattern;
	    foreach $search_pattern (@adaptor_variations){
		if($search_target=~s/($search_pattern)/\%$1\#/){
		    $start_adaptor_position=index $search_target, '%';
		    $end_adaptor_position=(index $search_target, '#')-2;
		    my $clean_start=$seq_start+$end_adaptor_position;
		    push @nov_adaptor, $clean_start, $seq_end;
		    last; 
		}
	    }
	}


	my @fragments_to_eval;
	
	
#if there was only one fragment w/ adaptor seq
	if (@nov_adaptor == 2){
	    $best_seq_start=$nov_adaptor[0];
	    $best_seq_end=$nov_adaptor[1];

#######DEBUG#########
	    print "found single adaptor, keeping $best_seq_start to $best_seq_end\n";

	}
#if there was more than one fragment w/ adaptor seq
	elsif (@nov_adaptor > 2){
	    @fragments_to_eval=@nov_adaptor;

#######DEBUG##########
	    print "found more than one fragment w/ adaptor: " . (join (',',@nov_adaptor)) . "\n";

	}
#if there was no fragment w/ adaptor seq
	else{
	    my $index=0;
	    while ($index < @{$nov_pieces{'start'}}){	
		push @fragments_to_eval, $nov_pieces{'start'}[$index], $nov_pieces{'stop'}[$index];
		$index++;
	    }
#######DEBUG##########
		print "No adaptor found, trying all fragments: " . join (',', @fragments_to_eval) . "\n";

	}
	
	
	if(@fragments_to_eval){
	    
	    my ($max_avg_qual, $avg_qual) = (0,0);
	    my $index=0;
	    
	    while($index<@fragments_to_eval){
		$seq_start=$fragments_to_eval[$index];
		$index++;
		$seq_end=$fragments_to_eval[$index];
		$index++;
			
#skip 0 length fragments	    
		unless ($seq_end > $seq_start){
		    next;
		}

#look for good avg quality
		my $sum=0;
		for ($i=$seq_start;$i<=$seq_end;$i++){
		    $sum+=$qual[$i];
		}
		
		$avg_qual=$sum/($seq_end-$seq_start);

########DEBUG########
		print "AVG=$avg_qual for $seq_start to $seq_end\n";

		if($max_avg_qual<$avg_qual){
		    $max_avg_qual=$avg_qual;
		    $best_seq_start=$seq_start;
		    $best_seq_end=$seq_end;
		}
		
	    }
	    
########DEBUG########
	    print "Best Avg: $max_avg_qual for $best_seq_start to $best_seq_end\n";

	    
#if the max avg qual is below the treshold, use the longest stretch instead
	    if ($max_avg_qual < $treshold_qual){
		my $max_ln=0;
	    
		while($index<@fragments_to_eval){
		    my $ln=$fragments_to_eval[$index+1] - $fragments_to_eval[$index];
		    
		    if($max_ln < $ln){
			$max_ln = $ln;
			$best_seq_start=$fragments_to_eval[$index];
			$best_seq_end=$fragments_to_eval[$index+1];
		    }
		    $index+=2;
		}

########DEBUG########
		print "Avg too low, best length: $max_ln for $best_seq_start to $best_seq_end";
	    }
	    
	}	

	$best_seq=join('',@seq[$best_seq_start..$best_seq_end]);
	$best_qual=join(' ',@qual[$best_seq_start..$best_seq_end]);

#set vector trim values
	$front_vector_end=$best_seq_start+$front_trim;
	$tail_vector_start=$best_seq_end+$front_trim;
    }

########DEBUG##########
    print "vector trimming to $front_vector_end and from $tail_vector_start\n";
    print '#' x 80 . "\n";
	
	
#print out the new values
    print SEQ_FILEOUT ">$seq_id - vector trimmed \n$best_seq\n";
    print QUAL_FILEOUT ">$seq_id - vector trimmed \n$best_qual\n"; 
    
	
#load trimming info into the database
    $stm="insert into vector_trim (raw_seq_id, seq_id, front_trim_end, tail_trim_start) values ('$seq_id', '$seq_group_id', '$front_vector_end', '$tail_vector_start')";
    $sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";
    
}

    

close SEQ_FILEOUT;
close QUAL_FILEOUT;
    

 db_link::disconnect_db($dbh);


 runtime::runtime_print($start_time, "Vector trimming");


