#!/usr/bin/perl -w
use strict;

@ARGV or print "No input parameters, proceeding with default.\n";

my @arg_pairs = split (/\-/, (join ' ', @ARGV));

my %args=();

foreach (@arg_pairs){
  
  $_ or next;
  my ($flag, $val)=split /\s+/;
    $args{$flag}=$val;
}


my $in_dir=$args{'i'};
my $out_dir=$args{'o'};
my $treshold=$args{'t'};
my $min_ln=$args{'l'};
my $ambiguity_treshold=$args{'n'};
my $max_ambiguity_pct=$args{'a'};
my $max_single_letter_pct=$args{'s'};
my $max_two_letters_pct=$args{'d'};
my $verbose=$args{'v'};


#set defaults for io, script filename, etc
$treshold ||= 15;
$min_ln ||= 150;
$max_ambiguity_pct ||= 4; 
$ambiguity_treshold ||= 5;
$max_single_letter_pct ||= 60;
$max_two_letters_pct ||= 80;
$in_dir ||= "/tmp/";
$out_dir ||="/tmp/";

#uniform trailing slashes
$in_dir=~/\/$/ or $in_dir.='/';
$out_dir=~/\/$/ or $out_dir.='/';

my $seq_in_file=$in_dir."ecoli_clean_seq.fasta";
my $qual_in_file=$in_dir."ecoli_clean_seq.fasta.qual";
my $passed_seq_file=$out_dir."qual_passed_seq.fasta";
my $passed_qual_file=$out_dir."qual_passed_seq.fasta.qual";
my $failed_seq_file=$out_dir."qual_failed_seq.fasta";
my $failed_qual_file=$out_dir."qual_failed_seq.fasta.qual";
#main body of script
#####################


$verbose and print "Reading sequences from file\n";

my %sequences=();
my %quality=();
my ($defline, $dataline);

#read in all the sequences
open FILEIN, "$seq_in_file";

while (<FILEIN>){

    /^\s*$/ and next;

    chomp;
    if (/^>([^\s]+)\s*/){ 
        if ($defline){
            $sequences{$defline}=$dataline;
        }
        $defline=$1;
        $dataline='';
        next;
    }

    s/^\s*([^\s]+)\s*$/$1/;
    $dataline.=$_;
}
$sequences{$defline}=$dataline;

close FILEIN;


#read in all the sequence quality values
open FILEIN, "$qual_in_file";
while (<FILEIN>){
    chomp;
    if (/^>([^\s]+)\s*/){ 
        if ($defline){
            $quality{$defline}=$dataline;
        }
        $defline=$1;
        $dataline='';
        next;
    }

    s/^\s*([^\s]+)\s*$/$1/;
    $dataline.=$_;
}
$quality{$defline}=$dataline;

close FILEIN;


$verbose and print "Sequences read, checking quality.\n";



my %qual_lookup = ();
$qual_lookup{'1'} = 'not evaluated';
$qual_lookup{'2'} = 'too short';
$qual_lookup{'3'} = 'too ambiguous';
$qual_lookup{'4'} = 'E.coli contamination';
$qual_lookup{'5'} = 'low overall quality';
$qual_lookup{'6'} = 'passed';
$qual_lookup{'7'} = 'low complexity';


my %pass_val=();

foreach (keys %sequences){

    my $length_seq=length $sequences{$_};
    my @qual=split(/\s+/,$quality{$_});

#check for average quality above 15, else fail it

#check for empty value for sequences that didn't survive quality trimming
    unless (@qual > 0){
	$pass_val{$_}='5';
	next;
    }

    my $total=0;
    my $i;
    for ($i=0; $i<@qual; $i++){
	$total+=$qual[$i];
    }
    my $avg_qual=$total/@qual;

    if($avg_qual <= $treshold){
	$pass_val{$_}='5';
	next;
    }

#check to see if seq > 150bp (excluding any polyA tail), else fail it

    my $useful_seq=$sequences{$_};
    $useful_seq=~s/[AN]+$//;
    if ( (length $useful_seq) < $min_ln){
	$pass_val{$_}='2';
	next;
    }

#check for less than 4% ambiguity else fail it
#change basecalls below a treshold (5 by default) to N for this check

    my $ambig_seq=$sequences{$_};
    my $qual_index=0;
    while($qual_index<@qual){
	if($qual[$qual_index] <= $ambiguity_treshold){
	    substr $ambig_seq, $qual_index, 1, 'N';
	}
	$qual_index++;
    }
    $ambig_seq=~tr/ACGT//d;
    my $ambig_len=length $ambig_seq;
    my $ambig_pct=($ambig_len / $length_seq) * 100;
    if ($ambig_pct > $max_ambiguity_pct){
#	print "ambig pct for $_ is $ambig_len / $length_seq ($ambig_pct):\n$ambig_seq\n";
	$pass_val{$_}='3';
	next;
    }

	
#check for low complexity sequence (after excluding poly-A tail)
    my %letter_count=();
    my $low_complexity;

    my @seq=split(//,$useful_seq);

    my $letter;
    foreach $letter (@seq){
	$letter_count{$letter}++;
    }
    
    my $first_letter;
    foreach $first_letter (keys %letter_count){
	if ($letter_count{$first_letter}>(($max_single_letter_pct/100) * int(@seq))){
	    $low_complexity='true';
	    last;
	}
	my $second_letter;
	foreach $second_letter (keys %letter_count){
	    $second_letter eq $first_letter and next;
	    if (($letter_count{$first_letter}+$letter_count{$second_letter})>(($max_two_letters_pct/100) * int(@seq))){
		$low_complexity='true';
		last;
	    }
	}
	$low_complexity and last;
    }
    
    
    if($low_complexity){
	$pass_val{$_}='7';
	next;
    }

#all criteria passed;
    $pass_val{$_}='6';

}

if ($verbose){
    print "Sequences evaluated, writing data.\n";
}

open FAILED_SEQ_FILE, ">$failed_seq_file";
open FAILED_QUAL_FILE, ">$failed_qual_file";
open PASSED_SEQ_FILE, ">$passed_seq_file";
open PASSED_QUAL_FILE, ">$passed_qual_file";

foreach (keys %sequences){
    if ($pass_val{$_}==6){
	print PASSED_SEQ_FILE ">$_\n$sequences{$_}\n";
	print PASSED_QUAL_FILE ">$_\n$quality{$_}\n";
    }
    else{
	print FAILED_SEQ_FILE ">$_ - $qual_lookup{$pass_val{$_}}\n$sequences{$_}\n";
	print FAILED_QUAL_FILE ">$_ - $qual_lookup{$pass_val{$_}}\n$quality{$_}\n";
    }
}

close FAILED_SEQ_FILE;
close FAILED_QUAL_FILE;
close PASSED_SEQ_FILE;
close PASSED_QUAL_FILE;


$verbose and print "Quality evaluation done\n";
