#!/usr/bin/perl -w
use strict;

#person in charge of this script
#this should be the person currently in charge of scripts for the SGN project
my $script_maintainer='Dan Ilut <dci1@cornell.edu>';

use lib '/data/shared/pgn_data_processing/scripts/perllib';

#local packages to use
use runtime;
use db_link;
use projects;


@ARGV or print "No input parameters, proceeding with default.\n";

my @arg_pairs = split (/\-/, (join ' ', @ARGV));

my %args=();

foreach (@arg_pairs){

    $_ or next;
    my ($flag, $val)=split /\s+/;
    $args{$flag}=$val;
}

my $seq_in_file=$args{'i'};
my $qual_in_file=$args{'a'};
my $seq_out_file=$args{'o'};
my $qual_out_file=$args{'q'};
my $project=$args{'p'};
my $treshold_qual=$args{'t'};
my $verbose=$args{'v'};

$project or die  "Please specify a project (cgn, fgn, pgn) using the -p flag\n";

my ($db, $usr) = @{projects::get_db_info($project)};
$db or die "No known database for project $project";

#set defaults for io, script filename, etc
$seq_in_file ||="/tmp/seqs_fasta.screen";
$qual_in_file ||="/tmp/seqs_fasta.screen.qual";
$seq_out_file ||="/tmp/seqs_fasta.trim";
$qual_out_file ||="/tmp/seqs_fasta.trim.qual";
$treshold_qual ||= 15;


#main body of script
#####################

my $start_time=time;

my %seq_hash=();
my %qual_hash=();

#get the sequences and quality and hash them

$verbose and print "Parsing sequence files\n";

my ($defline, $dataline);

open SEQ_FILEIN, $seq_in_file or die "couldn't open $seq_in_file for read\n";
foreach (<SEQ_FILEIN>){
    chomp;
    if (/^>([^\s]+)\s/){ 
	if ($defline){
	    $seq_hash{$defline}=$dataline;
	}
	$defline=$1;
	$dataline='';
	next;
    }

    s/^\s*([^\s]+)\s*$/$1/;
    $dataline.=$_;
}
$seq_hash{$defline}=$dataline;
close SEQ_FILEIN;

($defline,$dataline)=('','');

open QUAL_FILEIN, $qual_in_file or die "couldn't open $qual_in_file for read\n";

foreach (<QUAL_FILEIN>){
    chomp;
    if (/^>([^\s]+)\s/){ 

	if ($defline){
	    $qual_hash{$defline}=$dataline;
	}
	$defline=$1;
	$dataline='';
	next;
    }

    $dataline.=$_;
}
$qual_hash{$defline}=$dataline;
close QUAL_FILEIN;

my $nr_seq=int(keys %seq_hash);
my $nr_qual=int(keys %qual_hash);

unless ($nr_seq == $nr_qual){
    die "Seq - qual missmatch: $nr_seq sequences, $nr_qual quality value entries\n";
}

my $segment_time=time;
if($verbose){
 runtime::runtime_print($start_time, "Parsing inputs");
   print "Read $nr_seq sequences and $nr_qual quality value sets.\n";
   print "Starting trimming.\n";
}


# try to open the database link
my $dbh = db_link::connect_db($db, $usr) or die "couldn't open database link\n";
my ($stm, $sth, $rv, $rc);


open SEQ_FILEOUT, ">$seq_out_file" or die "couldn't open $seq_out_file for write.\n";
open QUAL_FILEOUT, ">$qual_out_file" or die "couldn't open $seq_out_file for write.\n";

my $seq_id;
foreach $seq_id (keys %seq_hash){

#skip empty sequences
    unless($seq_hash{$seq_id}){
	$verbose and print "No sequence for $seq_id, skipping\n";
	next;
    }

    my $seq_group_id;
    my $adaptor_seq='';

#get the adaptor sequence and seq_group id from the library information
    $stm="select l.adaptor_sequence, r.seq_id from raw_sequence as r left join est_info as i on r.seq_id=i.seq_id left join est_library as l on i.est_library_id=l.est_library_id where raw_seq_id='$seq_id'";
    $sth = $dbh->prepare($stm) 
        || die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
        || die "Can't execute statement: $DBI::errstr";
    $rc = $sth->bind_columns(\$adaptor_seq, \$seq_group_id);
    $sth->fetch; 
    
    $qual_hash{$seq_id} or die "No qual values for $seq_id\n";

    my @qual=split(/ /, $qual_hash{$seq_id});


    my (@seq_pieces, @qual_pieces);
    @seq_pieces=split(/X+/, $seq_hash{$seq_id});

    my ($best_seq_piece, $best_qual_piece, $best_adaptor)=('','','');

#find out the indices for the screened pieces if any
    if ($seq_hash{$seq_id}=~/X/){

	my $seq=$seq_hash{$seq_id};
	$seq=~s/(X+)/\|$1\|/g;
	my @pcs=split(/\|/,$seq);
	my ($pcs_start,$pcs_end)=(0,0);
	foreach (@pcs){
	    $_ or next;
	    $pcs_end=$pcs_start+length;
	    if (/^X+$/){
#load screening info into the database
		$stm="insert into vector_pieces (raw_seq_id, seq_id, vector_start, vector_end) values ('$seq_id', '$seq_group_id', '$pcs_start', '$pcs_end')";
	    $sth = $dbh->prepare($stm) 
		|| die "Can't prepare statement: $DBI::errstr";
	    $rv = $sth->execute
		|| die "Can't execute statement: $DBI::errstr";
	    }
	    $pcs_start=$pcs_end;
	}
    }


    my (@adaptor_seqs, @trimmed_seq_pieces, @trimmed_adaptor_seq_pieces, @trimmed_qual_pieces, @trimmed_adaptor_qual_pieces);

    foreach (@seq_pieces){

	$_ or next;

	my $start= index ($seq_hash{$seq_id}, $_);
	my $end= $start + (length $_) - 1;
	my @seq_piece=split(//,$_);
	my @qual_piece=@qual[$start..$end];


# look for adaptor seq within 2*length of adaptor from start of fragment
# Added the GG part to the end of the adaptors in the database, and set the last G as a "wiggle bit" (dan, 2003/08/10)
#

# if no adaptor if found, skip this check (dan, 2004/04/10)

	if ($adaptor_seq){

	    my $adaptor_window_ln = 2 * length $adaptor_seq;
	    my $window_start=0;
	    ($adaptor_window_ln > length) and $adaptor_window_ln = length ;
	    my $window_end=$adaptor_window_ln-1;
	    
	    my $search_target=join ('', @seq_piece[$window_start..$window_end]);
	    
	    my ($end_adaptor_position,$start_adaptor_position);
	    my $search_pattern=$adaptor_seq;
	    
#set the last letter as a variable nucleotide
	    $search_pattern =~ s/[ATGC]$/\[ATGCN\]\{0,1\}/;
	    
	    
	    if($search_target=~s/($search_pattern)/\%$1\#/){
		$start_adaptor_position=index $search_target, '%';
		$end_adaptor_position=(index $search_target, '#')-2;
		push @adaptor_seqs,$1;
	    }


	
	    if($end_adaptor_position){
		splice @seq_piece,0,$end_adaptor_position+1;
		splice @qual_piece,0,$end_adaptor_position+1;
	    }

	}


#find the best quality region given the treshold
	my ($i, $j, $i_best, $j_best, $sum, $sum_max)= (0,0,0,0,0,0);

	for ($j=0; $j<@qual_piece; $j++){
	    
#raise the zero floor to the treshold

#######DEBUG########
#	    defined $qual_piece[$j] or die "The impossible happened: No qual_piece for $j, aborted";

	    my $adjusted_qual=$qual_piece[$j]-$treshold_qual;
	    $sum+=$adjusted_qual;
	    
	    if($sum > $sum_max){
		$sum_max=$sum;
		$i_best=$i;
		$j_best=$j;
	    }
	    if($sum < 0){
		$sum=0;
		$i=$j+1;
	    }
	    
	}
	
	my ($best_seq, $best_qual)=('','');
	
	if ($sum_max){
	    $best_seq=join('', @seq_piece[$i_best..$j_best]);
	    $best_qual=join(' ', @qual_piece[$i_best..$j_best]);
	}

	if ($end_adaptor_position){
	    push @trimmed_adaptor_seq_pieces, $best_seq;
	    push @trimmed_adaptor_qual_pieces, $best_qual;
	}
	else{
	    push @trimmed_seq_pieces, $best_seq;
	    push @trimmed_qual_pieces, $best_qual;
	}
    }

#select best piece from ones with adaptor if found, from all pieces if not
    if(@trimmed_adaptor_seq_pieces){
	my $adaptor_index=0;
	while($adaptor_index<@trimmed_adaptor_seq_pieces){
	    if((length $best_seq_piece) < (length $trimmed_adaptor_seq_pieces[$adaptor_index])){
		$best_seq_piece=$trimmed_adaptor_seq_pieces[$adaptor_index];
		$best_qual_piece=$trimmed_adaptor_qual_pieces[$adaptor_index];
		$best_adaptor=$adaptor_seqs[$adaptor_index];
	    }
	    $adaptor_index++;
	}
    }
    else{
	my $piece_index=0;
	while($piece_index<@trimmed_seq_pieces){
	    if((length $best_seq_piece) < (length $trimmed_seq_pieces[$piece_index])){
		$best_seq_piece=$trimmed_seq_pieces[$piece_index];
		$best_qual_piece=$trimmed_qual_pieces[$piece_index];
	    }
	    $piece_index++;
	}
    }


#print out the new values
    print SEQ_FILEOUT ">$seq_id - low quality trimmed \n$best_seq_piece\n";
    print QUAL_FILEOUT ">$seq_id - low quality trimmed \n$best_qual_piece\n"; 

#find position of best piece
    my $front_trim_end=index ($seq_hash{$seq_id}, $best_seq_piece)-1;
    my $tail_trim_start=$front_trim_end + (length $best_seq_piece) +1;

#insert trimming info into the database
    $stm="insert into quality_trim (raw_seq_id, seq_id, front_trim_end, tail_trim_start) values ('$seq_id', '$seq_group_id', '$front_trim_end', '$tail_trim_start')";
    $sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";


    if ($best_adaptor){
#find adaptor position
	my $adaptor_start=index ($seq_hash{$seq_id}, $best_adaptor);
	my $adaptor_end=$adaptor_start + (length $best_adaptor);

#insert adaptor info
	$stm="insert into adaptor_location (raw_seq_id, seq_id, adaptor_start, adaptor_end) values ('$seq_id', '$seq_group_id', '$adaptor_start', '$adaptor_end')";
    $sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";
    }
}



close SEQ_FILEOUT;
close QUAL_FILEOUT;

 db_link::disconnect_db($dbh);

$verbose and runtime::runtime_print($segment_time, "Trimming");
 runtime::runtime_print($start_time, "Vector and quality trimming");


