#!/usr/bin/perl -w
use strict;

#person in charge of this script
#this should be the person currently in charge of scripts for the SGN project
my $script_maintainer='Dan Ilut <dci1@cornell.edu>';

use lib '/soldb/www/perllib';

#local packages to use
use runtime;
use db_link;
use projects;


# ---------------------------------------------------------------------------
# Command line handling.
#
# Recognised flags (each followed by its value):
#   -i  directory holding the cleaned blast results (optional, has a default)
#   -p  project name, passed to projects::get_db_info (required)
#   -t  blast target name (required)
# ---------------------------------------------------------------------------
@ARGV or print "No input parameters, proceeding with default.\n";

my %args = ();

# Walk @ARGV token by token instead of joining it and splitting on '-':
# splitting on every hyphen truncated any value that itself contained a
# hyphen or a space (for example "-i /data/my-run/" or "-t nr-2003").
my @pending = @ARGV;
while (@pending) {
    my $token = shift @pending;

    # Only tokens that start with a dash introduce a flag; stray words are
    # ignored.  A single quoted token such as "-p cgn" carries its own value.
    next unless $token =~ /^-+(\S+)\s*(.*)$/;
    my ($flag, $val) = ($1, $2);

    if ($val eq '') {
        # The value is the next token, unless that token is itself a flag
        # (in which case this flag simply has no value).
        $val = (@pending and $pending[0] !~ /^-/) ? shift @pending : undef;
    }

    $args{$flag} = $val;
}

my $in_dir  = $args{'i'};
my $project = $args{'p'};
my $target  = $args{'t'};

$project or die "Please specify a project (cgn, fgn, pgn) using the -p flag\n";
$target or die "Please specify the blast target with the -t flag\n";

# Guard the dereference: if get_db_info returns a false value, report the
# unknown project with the message below rather than dying on a strict-refs
# error.  NOTE(review): the return shape is assumed to be an array reference
# of (database, user), as the original dereference implies -- confirm.
my $db_info = projects::get_db_info($project);
my ($db, $usr) = $db_info ? @{$db_info} : ();
$db or die "No known database for project $project";

#set defaults for io, script filename, etc
#$in_dir ||= "/soldb/pgn_data_processing/blast_results";
$in_dir ||= "/data/shared/dan/${project}_blast/results/";

#uniform trailing slashes
$in_dir .= '/' unless $in_dir =~ m{/$};

# Input file name: <in_dir><project>_vs_<target>_clean_blast_results.txt
my $clean_blast_results_file = $in_dir . $project . "_vs_" . $target . "_clean_blast_results.txt";

# ---------------------------------------------------------------------------
# Parse the cleaned blast results file into %seq_blast_result.
#
# %seq_blast_result maps the number captured from a "Query=" line to a list
# of records; each record is an array reference holding, in the order the
# lines are encountered:
#   [ description, score, evalue, identity percentage, frame ]
# ---------------------------------------------------------------------------
my $start_time = time;

# Three-argument open with a lexical handle; include $! so the reason for a
# failure is reported.
open(my $blast_fh, '<', $clean_blast_results_file)
    or die "Couldn't open $clean_blast_results_file: $!\n";

my %seq_blast_result = ();
my $trimmed_seq_id;

# Start at 0 so the "> 10" test below does not compare an undefined value
# when match lines appear before the first "Query=" line.
my $match_count = 0;

my @match_line;
my $match_info;

while (<$blast_fh>) {

    chomp;

    #skip separator and non-match lines
    next if /^[\s-]+$/;

    #skip no hits
    next if /No hits/;

    #skip sequence match indicator
    next if /^Sequence/;

    # A new query: remember its id and reset the per-query state.
    if (/^Query=\s*([0-9]+)/) {
        $trimmed_seq_id = $1;
        $match_count    = 0;
        @match_line     = ();
        next;
    }

    # Once the counter has passed 10, ignore everything up to the next query.
    # The 11th ">" line is still read (the counter only then reaches 11), but
    # its score lines are skipped, so its record is never stored.
    next if $match_count > 10;

    # Start of a match: the description begins after ">" and continues on the
    # following lines until the "Length = N" line.
    if (/^>(.+)$/) {
        $match_count++;
        $match_info = $1;

        while (1) {
            my $defline = <$blast_fh>;

            # A truncated file would otherwise make this loop spin forever on
            # an undefined line; stop with a clear error instead.
            unless (defined $defline) {
                my $query_label = defined $trimmed_seq_id ? $trimmed_seq_id : 'unknown';
                die "Unexpected end of $clean_blast_results_file while reading "
                    . "the description of match $match_count for query $query_label\n";
            }

            last if $defline =~ /^\s+Length = [0-9]+\s/;

            $defline =~ s/\n//g;
            #shrink spaces
            $defline =~ s/\s{2,}/ /g;
            $match_info .= $defline;
        }

        #escape ' and " (the description is later placed inside a quoted SQL
        #string by the loading section)
        $match_info =~ s/([\'\"])/\\$1/g;

        push @match_line, $match_info;

        #######DEBUG#########
        #	print "SEQID=$trimmed_seq_id; MATCHNR=$match_count; DEFLINE: $match_line[0]\n";

        next;
    }

    # Score line: the part before the comma holds the bit score, the part
    # after it the e-value.
    # NOTE(review): this pattern is not anchored, so any line containing
    # "Score" is treated as a score line -- confirm that is intended for the
    # cleaned report format.
    if (/\s*Score/) {

        my ($S, $E) = split /,/;

        $S =~ s/^.+=(.+)bits.+$/$1/;
        $S =~ tr/ //d;
        $E =~ s/^.+=(.+)$/$1/;
        $E =~ tr/ //d;
        #change the screwy evalues into actual numbers
        $E =~ s/^e/1e/;

        push @match_line, $S, $E;

        #######DEBUG#########
        #	print "SEQID=$trimmed_seq_id; MATCHNR=$match_count; SCORE: $match_line[1]; EVAL: $match_line[2]\n";

        next;
    }

    # Identities line: keep the percentage found inside the parentheses.
    if (/^\s*Identities[^\(]+\(([0-9]+)%\)/) {
        push @match_line, $1;

        ########DEBUG########
        #	print "SEQID=$trimmed_seq_id; MATCHNR=$match_count; IDENT_PCT: $match_line[3]";

        next;
    }

    # Frame line: deliberately no "next", so control falls through to the
    # store step below.
    if (/^\s*Frame\s+=\s+([0-9+-]+)\s*/) {
        push @match_line, $1;

        ########DEBUG#########
        #	print "SEQID=$trimmed_seq_id; MATCHNR=$match_count; FRAME: $1\n";

    }

    #load the results from the match
    # Reached after a Frame line, and also by any line that none of the rules
    # above consumed; a record in progress is stored and the buffer cleared.
    if ($match_line[0]) {
        push @{$seq_blast_result{$trimmed_seq_id}}, [ @match_line ];
        @match_line = ();
    }

}


close $blast_fh;

runtime::runtime_print($start_time,"Blast result parsing");

# ---------------------------------------------------------------------------
# Connect to the database, resolve the blast target id and remove the rows
# previously loaded for that target.
# ---------------------------------------------------------------------------

# try to open the database
my $dbh = db_link::connect_db($db, $usr) or die "couldn't open database link\n";

# Statement text, statement handle and return values; these file-level
# variables are reused by the later database sections of this script.
my ($stm, $sth, $rv, $rc);

#get the target id
# The target name comes from the command line, so it is passed as a bind
# value instead of being interpolated into the SQL text.
my $target_id;
$stm = "select blast_target_id from blast_target where target_name=?";
$sth = $dbh->prepare($stm)
    or die "Can't prepare statement: $DBI::errstr";
$rv = $sth->execute($target)
    or die "Can't execute statement: $DBI::errstr";
$rc = $sth->bind_columns(\$target_id);
$sth->fetch or die "Can't find target id for $target";

# Only the first row is needed; release the handle before it is reused.
$sth->finish;


#remove previous entries
$stm = "delete from blast_result where blast_target_id=?";
$sth = $dbh->prepare($stm)
    or die "Can't prepare statement: $DBI::errstr";
$rv = $sth->execute($target_id)
    or die "Can't execute statement: $DBI::errstr";

# ---------------------------------------------------------------------------
# Look up the seq_id of every trimmed sequence that has parsed blast results.
#
# Fills %seq_lookup (trimmed_seq_id => seq_id) and counts in $matches how
# many parsed match records belong to sequences present in the
# trimmed_sequence table.
# ---------------------------------------------------------------------------
my %seq_lookup = ();
my $matches    = 0;
my $seq_id;

$stm = "select seq_id, trimmed_seq_id from trimmed_sequence";

unless ($sth = $dbh->prepare($stm)) {
    die "Can't prepare statement: $DBI::errstr";
}
unless ($rv = $sth->execute) {
    die "Can't execute statement: $DBI::errstr";
}

# Each fetch stores the current row in $seq_id and $trimmed_seq_id.
$rc = $sth->bind_columns(\$seq_id, \$trimmed_seq_id);

while ($sth->fetch) {
    # Rows for sequences without parsed results are of no interest.
    my $parsed_matches = $seq_blast_result{$trimmed_seq_id};
    next unless $parsed_matches;

    $seq_lookup{$trimmed_seq_id} = $seq_id;
    $matches += scalar @{$parsed_matches};
}


print "$matches sequence matches found, loading them to the database.\n";

# ---------------------------------------------------------------------------
# Insert every parsed match record into blast_result.
#
# The insert statement is prepared once with bind placeholders and executed
# per record, instead of being rebuilt and re-prepared for every row with the
# values interpolated into the SQL text.
# ---------------------------------------------------------------------------
$stm = "insert into blast_result (seq_id, trimmed_seq_id, blast_target_id, match_description, match_score, evalue, identity_pct, frame) values (?, ?, ?, ?, ?, ?, ?, ?)";
$sth = $dbh->prepare($stm)
    or die "Can't prepare statement: $DBI::errstr";

foreach my $parsed_id (keys %seq_blast_result) {

    # Parsed ids that were not found in trimmed_sequence have no seq_id.  As
    # before, their rows are still inserted with an empty seq_id, but the
    # situation is now reported explicitly.
    # NOTE(review): consider skipping these rows instead -- confirm intent.
    my $known_seq_id = $seq_lookup{$parsed_id};
    unless (defined $known_seq_id) {
        warn "No seq_id known for trimmed sequence $parsed_id; inserting with an empty seq_id\n";
        $known_seq_id = '';
    }

    my $j = 0;
    foreach my $match (@{$seq_blast_result{$parsed_id}}) {
        $j++;

        #	print "SEQID:$parsed_id\nCOMMENT:$$match[0]\nSCORE:$$match[1]\nEVAL:$$match[2]\nIDENT:$$match[3]\nFRAME:$$match[4]\n";

        # Every record needs its five fields: description, score, evalue,
        # identity percentage and frame.
        for my $i (0 .. 4) {
            (defined $$match[$i]) or die "No $i for $parsed_id on the $j-th match\n";
        }

        # The parser backslash-escapes quotes in the description so that it
        # could be embedded in a quoted SQL string.  With bind values the raw
        # text is wanted, so remove exactly the backslash added before each
        # quote.
        (my $description = $$match[0]) =~ s/\\(['"])/$1/g;

        #insert the new results
        $rv = $sth->execute(
            $known_seq_id, $parsed_id, $target_id, $description,
            $$match[1], $$match[2], $$match[3], $$match[4],
        ) or die "Can't execute statement: $DBI::errstr";

    }
}

db_link::disconnect_db($dbh);
