#!/usr/bin/perl -w
use strict;

#person in charge of this script
#this should be the person currently in charge of scripts for the SGN project
my $script_maintainer='Dan Ilut <dci1@cornell.edu>';

use lib '/data/shared/pgn_data_processing/scripts/perllib';


#local packages to use
use runtime;
use db_link;
use projects;

@ARGV or print "No input parameters, proceeding with default.\n";

my @arg_pairs = split (/\-/, (join ' ', @ARGV));

my %args=();

foreach (@arg_pairs){
  
  $_ or next;
  my ($flag, $val)=split /\s+/;
    $args{$flag}=$val;
}

my $project=$args{'p'};
my $treshold=$args{'t'};
my $min_ln=$args{'l'};
my $ambiguity_treshold=$args{'n'};
my $max_ambiguity_pct=$args{'a'};
my $max_single_letter_pct=$args{'s'};
my $max_two_letters_pct=$args{'d'};
my $verbose=$args{'v'};

$project or die "Please specify a project (cgn, fgn, pgn) using the -p flag\n";

my ($db, $usr) = @{projects::get_db_info($project)};
$db or die "No known database for project $project";


#set defaults for io, script filename, etc
$treshold ||= 15;
$min_ln ||= 150;
$max_ambiguity_pct ||= 4; 
$ambiguity_treshold ||= 5;
$max_single_letter_pct ||= 60;
$max_two_letters_pct ||= 80;



#main body of script
#####################


$verbose and print "Retrieving sequences from database\n";

my $start_time=time;

# try to open the database
my $dbh = db_link::connect_db($db, $usr) or die "couldn't open database link\n";
my ($stm, $sth, $rv, $rc);

my %sequences=();
my %quality=();
my %id_trans=();

#read in the sequences that haven't been evaluated
my ($seq_id, $trim_id, $seq, $qual);

$stm = " select t.seq_id, t.trimmed_seq_id, t.sequence_data, q.quality_values from trimmed_sequence as t, trimmed_sequence_quality as q, quality_evaluation as e where t.trimmed_seq_id=q.trimmed_seq_id and t.trimmed_seq_id=e.trimmed_seq_id and e.qual_criteria_id='1'";
$sth = $dbh->prepare($stm) 
    || die "Can't prepare statement: $DBI::errstr";
$rv = $sth->execute
    || die "Can't execute statement: $DBI::errstr";
$rc = $sth->bind_columns(\$seq_id, \$trim_id, \$seq, \$qual);

while ($sth->fetch){
    $sequences{$trim_id}=$seq;
    $quality{$trim_id}=$qual;
    $id_trans{$trim_id}=$seq_id;
}


if ($verbose){
  runtime::runtime_print($start_time, "Sequence retrieval");
    print "Sequences retrieved, evaluating quality\n";
    $start_time=time;
}

my %pass_val=();

foreach (keys %sequences){

    my $length_seq=length $sequences{$_};
    my @qual=split(/\s+/,$quality{$_});

#check for average quality above 15, else fail it

#check for empty value for sequences that didn't survive quality trimming
    unless (@qual > 0){
	$pass_val{$_}='5';
	$verbose and print "$_ is too low quality\n";
	next;
    }

    my $total=0;
    my $i;
    for ($i=0;$i<@qual;$i++){
	$total+=$qual[$i];
    }
    my $avg_qual=$total/@qual;

    if($avg_qual <= $treshold){
	$pass_val{$_}='5';
	$verbose and print "$_ is too low quality\n";
	next;
    }

#check to see if seq > 150bp (excluding any polyA tail), else fail it

    my $useful_seq=$sequences{$_};
    $useful_seq=~s/[AN]+$//;
    if ( (length $useful_seq) < $min_ln){
	$pass_val{$_}='2';
	$verbose and print "$_ is too short\n";
	next;
    }

#check for less than 4% ambiguity else fail it
#change basecalls below a treshold (5 by default) to N for this check

    my $ambig_seq=$sequences{$_};
    my $qual_index=0;
    while($qual_index<@qual){
	if($qual[$qual_index] <= $ambiguity_treshold){
	    substr $ambig_seq, $qual_index, 1, 'N';
	}
	$qual_index++;
    }
    $ambig_seq=~tr/ACGT//d;
    my $ambig_len=length $ambig_seq;
    my $ambig_pct=($ambig_len / $length_seq) * 100;
    if ($ambig_pct > $max_ambiguity_pct){
#	print "ambig pct for $_ is $ambig_len / $length_seq ($ambig_pct):\n$ambig_seq\n";
	$pass_val{$_}='3';
	$verbose and print "$_ is too ambiguous\n";
	next;
    }

	
#check for low complexity sequence (after excluding poly-A tail)
    my %letter_count=();
    my $low_complexity;

    my @seq=split(//,$useful_seq);

    my $letter;
    foreach $letter (@seq){
	$letter_count{$letter}++;
    }
    
    my $first_letter;
    foreach $first_letter (keys %letter_count){
	if ($letter_count{$first_letter}>(($max_single_letter_pct/100) * int(@seq))){
	    $low_complexity='true';
	    last;
	}
	my $second_letter;
	foreach $second_letter (keys %letter_count){
	    $second_letter eq $first_letter and next;
	    if (($letter_count{$first_letter}+$letter_count{$second_letter})>(($max_two_letters_pct/100) * int(@seq))){
		$low_complexity='true';
		last;
	    }
	}
	$low_complexity and last;
    }
    
    
    if($low_complexity){
	$pass_val{$_}='7';
	$verbose and print "$_ is too low complexity\n";
	next;
    }

#all criteria passed;
    $pass_val{$_}='6';

}

if ($verbose){
  runtime::runtime_print($start_time, "Sequence evaluation");
    print "Sequences evaluated, loading data.\n";
    $start_time=time;
}

#update the database
foreach (keys %pass_val){

    $stm = "update quality_evaluation set qual_criteria_id='$pass_val{$_}' where trimmed_seq_id='$_'";
    $sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";


#do this for the floral project to track genbank submissions
    if ($project eq 'fgn'){
#if it passed, add its seq_id to the genbank submission list
	if ($pass_val{$_} == 6){
	    $stm = "insert into genbank_submission (seq_id) values ('$id_trans{$_}')";
	    $sth = $dbh->prepare($stm) 
		|| die "Can't prepare statement: $DBI::errstr";
	    $rv = $sth->execute
		|| die "Can't execute statement: $DBI::errstr";
	}
	
    }
}




db_link::disconnect_db($dbh);



 runtime::runtime_print($start_time, "Quality evaluation");


