#!/usr/bin/perl -w
use strict;

#person in charge of this script
#this should be the person currently in charge of scripts for the SGN project
my $script_maintainer='Dan Ilut <dci1@cornell.edu>';

use lib '/data/shared/pgn_data_processing/scripts/perllib';

#local packages to use
use db_link;
use projects;
use runtime;

@ARGV or print "No input parameters, proceeding with default.\n";

my @arg_pairs = split (/\-/, (join ' ', @ARGV));

my %args=();

foreach (@arg_pairs){

    $_ or next;
    my ($flag, $val)=split /\s+/;
    $args{$flag}=$val;
}

my $lib=$args{'l'};
my $out_file=$args{'o'};
my $project=$args{'p'};


$lib or die "No library specified, try '-l library_name (-l all to process all libraries)'\n";
$project or die "Please specify a project (cgn, fgn, pgn) using the -p flag\n";

my ($db, $usr) = @{projects::get_db_info($project)};
$db or die "No known database for project $project";


#set defaults for io, script filename, etc
$out_file ||= "/tmp/${lib}_gc_content.txt";


#main body of script
#####################

my $start_time=time;

# try to open the database
my $dbh = db_link::connect_db($db, $usr) or die "couldn't open database link\n";
my ($stm, $sth, $rv, $rc);

#pull all sequences and relevant information for this library
my %seq;

$stm = "select ts.seq_id, ts.sequence_data from trimmed_sequence as ts, est_info as e, quality_evaluation as q";

unless ($lib eq 'all'){
    $stm .=", est_library as l";
}

 $stm.=" where ts.seq_id=e.seq_id and ts.trimmed_seq_id=q.trimmed_seq_id and q.qual_criteria_id='6'";

unless ($lib eq 'all'){
    $stm.=" and e.est_library_id=l.est_library_id and l.library_name='$lib'";
}

$sth = $dbh->prepare($stm) 
    || die "Can't prepare statement: $DBI::errstr";
$rv = $sth->execute
    || die "Can't execute statement: $DBI::errstr";
my ($seq_id,$seq_data);
$rc = $sth->bind_columns(\$seq_id,\$seq_data);

while ($sth->fetch){
    $seq{$seq_id}=$seq_data; 
}

print "Got " . int(keys %seq) . " sequences\n";

my %gc_pct=();
my %seq_ln=();
foreach (keys %seq){

    my $gc=0;
    my $ln= length $seq{$_};
    my @nucleotide = split //, $seq{$_};
    my $nc;
    foreach $nc (@nucleotide){
	($nc eq 'G' or $nc eq 'C') and $gc++;
    }
    $gc=$gc/$ln;

    $gc_pct{$_}=$gc;
    $seq_ln{$_}=$ln;
}


foreach (keys %seq){

    $stm= "insert into est_metadata (seq_id, gc_content, trimmed_length) values ('$_', '$gc_pct{$_}', '$seq_ln{$_}')";    
    $sth = $dbh->prepare($stm)
        || die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
        || die "Can't execute statement: $DBI::errstr";

}


#close the database link
db_link::disconnect_db($dbh);

