#!/usr/bin/perl -w
use strict;

#person in charge of this script
#this should be the person currently in charge of scripts for the SGN project
my $script_maintainer='Dan Ilut <dci1@cornell.edu>';

use lib '/soldb/local_scripts/custom_packages';

#local packages to use
use runtime;
use db_link_coffee;


@ARGV or print "No input parameters, proceeding with default.\n";

my @arg_pairs = split (/\-/, (join ' ', @ARGV));

my %args=();

foreach (@arg_pairs){
  
  $_ or next;
  my ($flag, $val)=split /\s+/;
    $args{$flag}=$val;
}

my $db='cgn';
my $script_file=__FILE__;
    
#keeping window size odd makes it easier to calculate median
my $window_size=11; 
my $threshold=10;




#main body of script
#####################

my $start_time=time;

# try to open the database
my $dbh = db_link_coffee::connect_db($db);

my %sequences=();
my %quality=();
my %clone_name=();

#read in the sequences that haven't been evaluated
my $stm = "select s.seq_id, s.sequence_data, q.quality_values, o.external_id from est_info as e, sequence as s, sequence_read_quality as q, other_identifier as o where e.seq_id=s.seq_id and e.seq_id=q.seq_id and e.seq_id=o.local_db_id";
my $sth = $dbh->prepare($stm) 
    || die "Can't prepare statement: $DBI::errstr";
my $rv = $sth->execute
    || die "Can't execute statement: $DBI::errstr";
my ($seq_id, $seq, $qual, $clone);
my $rc = $sth->bind_columns(\$seq_id, \$seq, \$qual, \$clone);

while ($sth->fetch){
    $sequences{$seq_id}=$seq;
    $quality{$seq_id}=$qual;
    $clone_name{$seq_id}=$clone;
}


my %trimmed_seq=();
my %trimmed_qual=();
my $report_print;

foreach (keys %sequences){
    
    my $seq_id=$_;

#read the individual quality values to an array
    my @qual=split(/\s+/,$quality{$seq_id});
    
#trim low quality ends
#use a sliding window starting at ends

    my $med_qval;
    my $window_start;
    my $window_end;

#trim the beginning
####################

    $med_qval=0;
    $window_start=0;
    while ($med_qval <= $threshold){

	$window_end=$window_start+$window_size-1;
	my $last_window;

#adjust for end of data
	if($window_end >= @qual){
	    $window_end=(@qual-1);
	    $last_window='true';
	}

	my @window_qual= sort {$a <=> $b} @qual[$window_start..$window_end];

	if(@window_qual % 2){
	    $med_qval=$window_qual[(@window_qual-1)/2];
	}
	else{
	    $med_qval=($window_qual[(@window_qual/2)-1] + $window_qual[(@window_qual/2)])/2;
	}

	$last_window and last;

	$window_start++;

############DEBUG#############
#	print "$seq_id start median $med_qval\n";
#	print join ' ', @window_qual;
#	print "\n";


    }

    my $start_cut_index=$window_start-1;

    

#trim the end
#############
    $med_qval=0;
    $window_start=@qual-1;
    while ($med_qval <= $threshold){
	$window_end=$window_start-$window_size;
	my $last_window;

#adjust for start of data
	if($window_end <= 0){
	    $window_end=0;
	    $last_window='true';
	}


	my @window_qual= sort {$a <=> $b} @qual[$window_end..$window_start];

	if(@window_qual % 2){
	    $med_qval=$window_qual[(@window_qual-1)/2];
	}
	else{
	    $med_qval=($window_qual[(@window_qual/2)-1] + $window_qual[(@window_qual/2)])/2;
	}

	$last_window and last;
	$window_start--;

#########DEBUG############
#	print "$seq_id end median $med_qval\n";
#	print join ' ', @window_qual;
#	print "\n";
#

    }

my $end_cut_index=$window_start+1;
    


#write the trimmed seq and qual
    unless ($end_cut_index==(@qual-1) and $start_cut_index==0){

#check for trimming overlap
	if ($end_cut_index <= $start_cut_index){
	    $trimmed_seq{$seq_id}='';
	    $trimmed_qual{$seq_id}='';
	}
	else{
	    $trimmed_seq{$seq_id}=substr ($sequences{$seq_id}, $start_cut_index, $end_cut_index-$start_cut_index);
	    $trimmed_qual{$seq_id}= join (' ', splice (@qual, $start_cut_index, $end_cut_index-$start_cut_index));
	}
	

	my $seq_id=$_;
	my $seq_ln=length $sequences{$seq_id};
	my $trimming_info="trimmed from 0 to $start_cut_index and from $end_cut_index to " . ($seq_ln - 1);
	$start_cut_index > $end_cut_index and $trimming_info="entire sequence trimmed";
	$report_print.="$clone_name{$seq_id}, $seq_ln length, $trimming_info\n\n";
	$report_print.="original:  $sequences{$seq_id}\n\n";
	$trimmed_seq{$seq_id} and $report_print.="trimmed:  $trimmed_seq{$seq_id}\n\n";
	$report_print.="original: $quality{$seq_id}\n\n";
	$trimmed_qual{$seq_id} and $report_print.= "trimmed:  $trimmed_qual{$seq_id}\n\n";
	$report_print.= '#'x80 . "\n\n";

    }
}




my $nr_trimmed= int(keys %trimmed_seq);
print "\n$nr_trimmed sequences trimmed.\n\n";
print '*'x80 . "\n\n";
print $report_print;



#update the database
foreach (keys %trimmed_seq){
#    $stm = "update sequence set sequence_data='$trimmed_seq{$_}' where seq_id='$_'";
#    $sth = $dbh->prepare($stm) 
#	|| die "Can't prepare statement: $DBI::errstr";
#    $rv = $sth->execute
#	|| die "Can't execute statement: $DBI::errstr";

#    $stm = "update sequence_read_quality set quality_values='$trimmed_qual{$_}' where seq_id='$_'";
#    $sth = $dbh->prepare($stm) 
#	|| die "Can't prepare statement: $DBI::errstr";
#    $rv = $sth->execute
#	|| die "Can't execute statement: $DBI::errstr";

}




db_link_coffee::disconnect_db($dbh);



 runtime::runtime_print($start_time, "Database upload");


