#!/usr/bin/perl -w
use strict;

#person in charge of this script
#this should be the person currently in charge of scripts for the SGN project
my $script_maintainer='Dan Ilut <dci1@cornell.edu>';

use lib '/data/shared/pgn_data_processing/scripts/perllib';

#local packages to use
use runtime;
use db_link;
use projects;


@ARGV or print "No input parameters, proceeding with default.\n";

my @arg_pairs = split (/\-/, (join ' ', @ARGV));

my %args=();

foreach (@arg_pairs){

    $_ or next;
    my ($flag, $val)=split /\s+/;
    $args{$flag}=$val;
}


my $seq_in_file=$args{'i'};
my $qual_in_file=$args{'a'};
my $seq_out_file=$args{'o'};
my $qual_out_file=$args{'q'};
my $project=$args{'p'};
my $nr_repeats=$args{'n'};
my $polyA_to_keep=$args{'k'};

$project or die  "Please specify a project (cgn, fgn, pgn) using the -p flag\n";

my ($db, $usr) = @{projects::get_db_info($project)};
$db or die "No known database for project $project";


#set defaults for io, script filename, etc
$seq_in_file ||= "/tmp/seqs_fasta.trim";
$qual_in_file ||= "/tmp/seqs_fasta.trim.qual";
$seq_out_file ||="/tmp/seqs_fasta.polytrim";
$qual_out_file ||="/tmp/seqs_fasta.polytrim.qual";
$nr_repeats ||= '12';
$polyA_to_keep ||= '20';


#main body of script
#####################

my $start_time=time;

my %seq_hash=();
my %qual_hash=();

#get the sequences and quality and hash them

my ($defline, $dataline);

open SEQ_FILEIN, $seq_in_file;
foreach (<SEQ_FILEIN>){
    chomp;
    if (/^>([^\s]+)\s/){ 
	if ($defline){
	    $seq_hash{$defline}=$dataline;
	}
	$defline=$1;
	$dataline='';
	next;
    }

    s/^\s*([^\s]+)\s*$/$1/;
    $dataline.=$_;
}
$seq_hash{$defline}=$dataline;
close SEQ_FILEIN;

($defline,$dataline)=('','');

open QUAL_FILEIN, $qual_in_file;

foreach (<QUAL_FILEIN>){
    chomp;
    if (/^>([^\s]+)\s/){ 

	if ($defline){
	    $qual_hash{$defline}=$dataline;
	}
	$defline=$1;
	$dataline='';
	next;
    }

    $dataline.=$_;
}
$qual_hash{$defline}=$dataline;
close QUAL_FILEIN;

# try to open the database link
my $dbh = db_link::connect_db($db, $usr) or die "couldn't open database link\n";
my ($stm, $sth, $rv, $rc);

open SEQ_FILEOUT, ">$seq_out_file";
open QUAL_FILEOUT, ">$qual_out_file";

foreach (keys %seq_hash){

#get the seq_group_id and raw seq to find trim location
    my ($seq_group_id, $raw_seq);
    $stm="select seq_id, sequence_data from raw_sequence where raw_seq_id='$_'";
    $sth = $dbh->prepare($stm) 
        || die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
        || die "Can't execute statement: $DBI::errstr";
    $rc = $sth->bind_columns(\$seq_group_id, \$raw_seq);
    $sth->fetch; 
    

    my $poly_trim='';
    my @seq=split(//,$seq_hash{$_});
    my @qual= split /\s/, $qual_hash{$_};

    my $seq_string=$seq_hash{$_};

#find poly-A
#ignoring other homopolymers since quality trimming should have taken care of any issues with them
    my ($cut_start, $cut_ln);
    if($seq_string=~s/(A{$nr_repeats,})/\a$1\#/){
#if less polyA than min to keep, cut at end, else cut after to_keep As
	if ((length $1)<$polyA_to_keep){
	    $cut_start=(index $seq_string, "\#") - 1;
	}	    
	else {
	    $cut_start=(index $seq_string, "\a") + $polyA_to_keep;
	}

	$cut_ln=(length $seq_string)-$cut_start-1;
	splice @seq, $cut_start;
	splice @qual, $cut_start;
	$poly_trim="- trimmed $cut_ln bp at poly-A tail";

    }

#print out the new values
    print SEQ_FILEOUT ">$_ $poly_trim \n".join('',@seq)."\n";
    print QUAL_FILEOUT ">$_ $poly_trim \n".join(' ',@qual)."\n"; 

    if($cut_start){

#find the trimmed seq start first
	my $trimmed_start=index $raw_seq, join('',@seq);
	my $polya_trim_start=$trimmed_start+$cut_start;
	my $polya_trim_end=$polya_trim_start + $cut_ln -1;

	unless($polya_trim_start==$polya_trim_end){
#insert trimming info into the database
	    $stm="insert into polya_trim (raw_seq_id, seq_id, polya_trim_start, polya_trim_end) values ('$_', '$seq_group_id', '$polya_trim_start', '$polya_trim_end')";
	    $sth = $dbh->prepare($stm) 
		|| die "Can't prepare statement: $DBI::errstr";
	    $rv = $sth->execute
		|| die "Can't execute statement: $DBI::errstr";
	}
    }

}



close SEQ_FILEOUT;
close QUAL_FILEOUT;

 db_link::disconnect_db($dbh);

 runtime::runtime_print($start_time, "Homopolymer trimming");


