#!/usr/bin/perl -w
use strict;


@ARGV or print "No input parameters, proceeding with default.\n";

my @arg_pairs = split (/\-/, (join ' ', @ARGV));

my %args=();

foreach (@arg_pairs){

    $_ or next;
    my ($flag, $val)=split /\s+/;
    $args{$flag}=$val;
}

my $in_dir=$args{'i'};
my $out_dir=$args{'o'};
my $contamination_target=$args{'c'};
my $verbose=$args{'v'};
#set defaults for io, script filename, etc
$in_dir ||= "/tmp/";
$out_dir ||="/tmp/";
$contamination_target ||="/soldb/pgn_data_processing/E.coli_K12/Ecoli_genome";

#uniform trailing slashes
$in_dir=~/\/$/ or $in_dir.='/';
$out_dir=~/\/$/ or $out_dir.='/';

my $seq_in_file=$in_dir."seq_to_eval.fasta";
my $qual_in_file=$in_dir."seq_to_eval.fasta.qual";
my $blast_result=$out_dir."seq_vs_ecoli.blast_clean_results";
my $passed_seq_file=$out_dir."ecoli_clean_seq.fasta";
my $passed_qual_file=$out_dir."ecoli_clean_seq.fasta.qual";
my $contaminated_seq_file=$out_dir."ecoli_contaminated_seq.fasta";
my $contaminated_qual_file=$out_dir."ecoli_contaminated_seq.fasta.qual";
my $blast_script="./batch_blast_generic.pl";


#main body of script
#####################


#do the blast vs ecoli
my $syscmd="$blast_script -i $seq_in_file -o $blast_result -t $contamination_target -p blastn -d f -e 1e-100";

system($syscmd);


open FILEIN, $blast_result;

my %matches=();
my $query;
while (<FILEIN>){
    /^\s*Query= ([0-9]+)/
	and $query = $1;
    /^\s*Sequences/
	and $matches{$query} = 1;
}

close FILEIN;



#read in all the sequences
open FILEIN, "$seq_in_file";
my %seq_hash=();
my ($defline, $dataline);
while (<FILEIN>){
    chomp;
    if (/^>([^\s]+)\s/){ 
        if ($defline){
            $seq_hash{$defline}=$dataline;
        }
        $defline=$1;
        $dataline='';
        next;
    }

    s/^\s*([^\s]+)\s*$/$1/;
    $dataline.=$_;
}
$seq_hash{$defline}=$dataline;

close FILEIN;


#read in all the sequence quality values
open FILEIN, "$qual_in_file";
my %qual_hash=();
while (<FILEIN>){
    chomp;
    if (/^>([^\s]+)\s/){ 
        if ($defline){
            $qual_hash{$defline}=$dataline;
        }
        $defline=$1;
        $dataline='';
        next;
    }

    s/^\s*([^\s]+)\s*$/$1/;
    $dataline.=$_;
}
$qual_hash{$defline}=$dataline;

close FILEIN;


#print out to files the results
open CLEAN_FILE, ">$passed_seq_file";
open CLEAN_QUAL_FILE, ">$passed_qual_file";
open ECOLI_FILE, ">$contaminated_seq_file";
open ECOLI_QUAL_FILE, ">$contaminated_qual_file";

foreach (keys %seq_hash){
    if($matches{$_}){
	print ECOLI_FILE ">$_ - ecoli contamination\n$seq_hash{$_}\n";
	print ECOLI_QUAL_FILE ">$_ - ecoli contamination\n$qual_hash{$_}\n";
    }
    else{
	print CLEAN_FILE ">$_\n$seq_hash{$_}\n";
	print CLEAN_QUAL_FILE ">$_\n$qual_hash{$_}\n";
    }
}

close CLEAN_FILE;
close CLEAN_QUAL_FILE;
close ECOLI_FILE;
close ECOLI_QUAL_FILE;



$verbose and print "E.coli screen done.\n";
