#!/usr/bin/perl -w
use strict;

#person in charge of this script
#this should be the person currently in charge of scripts for the SGN project
my $script_maintainer='Dan Ilut <dci1@cornell.edu>';

use lib '/data/shared/pgn_data_processing/scripts/perllib';

#local packages to use
use runtime;
use db_link;
use projects;

@ARGV or print "No input parameters, proceeding with default.\n";

my @arg_pairs = split (/\-/, (join ' ', @ARGV));

my %args=();

foreach (@arg_pairs){

    $_ or next;
    my ($flag, $val)=split /\s+/;
    $args{$flag}=$val;
}

my $seq_in_file=$args{'i'};
my $qual_in_file=$args{'q'};
my $project=$args{'p'};

$project or die "Please specify a project (cgn, fgn, pgn) using the -p flag\n";

my ($db, $usr) = @{projects::get_db_info($project)};

$db or die "No known database for project $project";

#set defaults for io, script filename, etc
$seq_in_file ||= "/tmp/seqs_fasta.polytrim";
$qual_in_file ||= "/tmp/seqs_fasta.polytrim.qual";


#main body of script
#####################

my $start_time=time;

#read in and hash the sequence and quality files
my %sequences=();
my %quality=();

my ($defline, $dataline)=('','');
open FILEIN, $seq_in_file;

foreach (<FILEIN>){
    chomp;
    if (/^>([^\s]+)[\s\n]/){ 
	if ($defline){
	    $sequences{$defline}=$dataline;
	}
	$defline=$1;
	$dataline='';
	next;
    }
    $dataline.=$_;
}

$sequences{$defline}=$dataline;

close FILEIN;

open FILEIN, $qual_in_file;
foreach (<FILEIN>){
    chomp;
    if (/^>([^\s]+)[\s\n]/){ 
	if ($defline){
	    $quality{$defline}=$dataline;
	}
	$defline=$1;
	$dataline='';
	next;
    }
    $dataline.=$_;
}
$quality{$defline}=$dataline;

close FILEIN;



# try to open the database
my $dbh = db_link::connect_db($db, $usr) or die "couldn't open database link\n";
my ($stm, $sth, $rv, $rc);

#get a list of existing trimmed sequences 
my %already_trimmed=();
my $trimmed_id;
$stm="select seq_id from trimmed_sequence";
$sth = $dbh->prepare($stm) 
    || die "Can't prepare statement: $DBI::errstr";
$rv = $sth->execute
    || die "Can't execute statement: $DBI::errstr";
$rc = $sth->bind_columns(\$trimmed_id);
while($sth->fetch){
    $already_trimmed{$trimmed_id}++;
}


my $seq_id;
foreach $seq_id (keys %sequences){

#calculate metadata (gc content and length)
    my $gc=0;
    my $ln= length $sequences{$seq_id};
    my @nucleotide = split //, $sequences{$seq_id};
    my $nc;
    foreach $nc (@nucleotide){
	($nc eq 'G' or $nc eq 'C') and $gc++;
    }
    $ln and $gc=$gc/$ln;


#get seq format and type 
    my($type, $format);
    $stm="select data_type, data_format from raw_sequence where seq_id='$seq_id' ";
    $sth = $dbh->prepare($stm) 
        || die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
        || die "Can't execute statement: $DBI::errstr";
    $rc = $sth->bind_columns(\$type, \$format);
    $sth->fetch;

#insert or update sequences

    if ($already_trimmed{$seq_id}){
	$stm = "update trimmed_sequence set sequence_data='$sequences{$seq_id}', data_type='$type', data_format='$format' where seq_id='$seq_id'";
    }
    else{
	$stm = "insert into trimmed_sequence (seq_id, sequence_data, data_type, data_format) values ('$seq_id', '$sequences{$seq_id}', '$type', '$format')";
    }
    $sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";

#get trimmed seq id
    my $trimmed_seqid;
    $stm="select trimmed_seq_id from trimmed_sequence where seq_id='$seq_id' ";
    $sth = $dbh->prepare($stm) 
        || die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
        || die "Can't execute statement: $DBI::errstr";
    $rc = $sth->bind_columns(\$trimmed_seqid);
    $sth->fetch;


#insert quality values

    if ($already_trimmed{$seq_id}){
	$stm = "update trimmed_sequence_quality set trimmed_seq_id='$trimmed_seqid', quality_values='$quality{$seq_id}' where seq_id='$seq_id'";
    }
    else{
	$stm = "insert into trimmed_sequence_quality (seq_id, trimmed_seq_id, quality_values) values ('$seq_id', '$trimmed_seqid', '$quality{$seq_id}' )";
    }
    $sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";

#set quality evaluation to unchecked
$stm = "insert into quality_evaluation (seq_id, trimmed_seq_id, qual_criteria_id) values ('$seq_id', '$trimmed_seqid', '1')"; 
    $sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";


#insert computed meta_data
#currently computing gc content and trimmed length
    $stm= "insert into est_metadata (seq_id, gc_content, trimmed_length) values ('$seq_id', '$gc', '$ln')";    
    $sth = $dbh->prepare($stm)
        || die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
        || die "Can't execute statement: $DBI::errstr";




}

#do a check to make sure we are inserting a trimmed sequence for each raw one
my ($raw_id, $trim_id);
$stm="select r.seq_id, t.seq_id from raw_sequence as r left join trimmed_sequence as t on r.seq_id=t.seq_id where t.seq_id is null";
$sth = $dbh->prepare($stm) 
    || die "Can't prepare statement: $DBI::errstr";
$rv = $sth->execute
    || die "Can't execute statement: $DBI::errstr";
$rc = $sth->bind_columns(\$raw_id,\$trim_id);
while ($sth->fetch){
    print "WARNING: Sequence $raw_id lost during trimming!\n";
}


 db_link::disconnect_db($dbh);


 runtime::runtime_print($start_time, "Trimmed sequence upload");


