#!/usr/bin/perl -w
use strict;

#person in charge of this script
#this should be the person currently in charge of scripts for the SGN project
my $script_maintainer='Dan Ilut <dci1@cornell.edu>';

use lib '/soldb/www/perllib';

#local packages to use
use runtime;
use db_link;
use projects;


@ARGV or print "No input parameters, proceeding with default.\n";

my @arg_pairs = split (/\-/, (join ' ', @ARGV));

my %args=();

foreach (@arg_pairs){

    $_ or next;
    my ($flag, $val)=split /\s+/;
    $args{$flag}=$val;
}

my $out_dir=$args{'o'};
my $project=$args{'p'};

$project or die "Please specify a project (cgn, fgn, pgn) using the -p flag\n";

my ($db, $usr) = @{projects::get_db_info($project)};
$db or die "No known database for project $project";


#set defaults for io, script filename, etc
$out_dir ||= "/tmp/";

#uniform trailing slashes
$out_dir=~/\/$/ or $out_dir.='/';

my $seq_out_file=$out_dir. $project . "_all_passed_seq.fasta";
my $qual_out_file=$out_dir. $project . "_all_passed_seq.fasta.qual";

my $script_file=__FILE__;


#main body of script
#####################

my $start_time=time;

# try to open the database
my $dbh = db_link::connect_db($db, $usr) or die "couldn't open database link\n";
my ($stm, $sth, $rv, $rc);

#pull all sequences and relevant information for this library
my ($seq_id, $seq, $quality, $read_direction, $chemistry, $dye, $clone);
my %sequences=();

$stm = "select ts.trimmed_seq_id, ts.sequence_data, tq.quality_values, e.read_direction, s.chemistry, s.dye, o.external_id from trimmed_sequence as ts, trimmed_sequence_quality as tq, est_info as e, sequencing_information as s, other_identifier as o, quality_evaluation as q where ts.trimmed_seq_id=tq.trimmed_seq_id and ts.seq_id=e.seq_id and e.sequencing_info_id=s.sequencing_info_id and ts.seq_id=o.local_db_id and ts.trimmed_seq_id=q.trimmed_seq_id and q.qual_criteria_id='6'";

$sth = $dbh->prepare($stm) 
    || die "Can't prepare statement: $DBI::errstr";
$rv = $sth->execute
    || die "Can't execute statement: $DBI::errstr";
$rc = $sth->bind_columns(\$seq_id, \$seq, \$quality, \$read_direction, \$chemistry, \$dye, \$clone);

while ($sth->fetch){
    push @{$sequences{$seq_id}}, ($seq, ,$quality, $read_direction, $chemistry, $dye, $clone); 
}

#close the database link
db_link::disconnect_db($dbh);


#write out the files for phrap
open (SEQ_OUT, ">$seq_out_file");
open (QUAL_OUT, ">$qual_out_file");

#make the lookup tables for the expected nomenclature, see phrap.doc for details
my %direction_table=();
$direction_table{'5'}='fwd';
$direction_table{'3'}='rev';

my %chem_table=();
$chem_table{'terminator'}='term';
$chem_table{'primer'}='prim';
$chem_table{'unknown'}='unknown';

my %dye_table=();
$dye_table{'rhodamine'}='rhod';
$dye_table{'d-rhodamine'}='d-rhod';
$dye_table{'big-dye'}='big';
$dye_table{'energy-transfer'}='ET';
$dye_table{'bodipy'}='bodipy';
$dye_table{'unknown'}='unknown';



print "Found ".int(keys %sequences)." sequences.\n";

foreach (keys %sequences){


    my $defline=">$sequences{$_}[5]-$_ DIRECTION: $direction_table{$sequences{$_}[2]}  CHEM: $chem_table{$sequences{$_}[3]}  DYE: $dye_table{$sequences{$_}[4]}  TEMPLATE: $sequences{$_}[5]\n"; 
    
    print SEQ_OUT $defline . $sequences{$_}[0] . "\n";
    print QUAL_OUT $defline . $sequences{$_}[1] . "\n";
    
}

close SEQ_OUT;
close QUAL_OUT;

#run phrap
my $phrap_results_file= $out_dir . $project . "_unigene_phrap.out";
my $syscmd="phrap $seq_out_file -new_ace > $phrap_results_file";

system($syscmd);

 runtime::runtime_print($start_time, "Unigene assembly");
my $time=time;

#extract assembly information
my $align_file=$out_dir . $project . "_unigene.alignment";


my %unigene_align=();
my ($cluster, $contig, $est);


open FILEIN, "$phrap_results_file" or die "Couldn't open file $phrap_results_file\n";
my ($readline, $vector_read);
my %vector=();

foreach (<FILEIN>){

#skip empty lines
    /^\s*$/ 
	and next;	
    
#turn off seqtrim_read after reading the EST trimming info
    /^Near duplicate reads/ 
	and $vector_read=0
	    and next;
    
#start reading EST possible vector info
    /^Probable unremoved sequencing vector/ 
	and $vector_read=1
	    and next;
    
#parse EST possible vector info
    $vector_read
	and /^[a-zA-Z0-9\.-]+-([0-9]+)\s+([0-9]+)-([0-9]+)\s+([a-zA-Z]+)/
	    and $vector{$1}=[$2, $3, $4]
		and next;
    
#turn off readline after finishing reading the components
    (/^Contig quality/ or /^Overall/)
	and $readline=0
	    and next;
    
#read the cluster summary and turn on readline
    /^Contig ([0-9]+)\.\s+([0-9]+) read[s]{0,1}; ([0-9]+) bp.+,\s+([0-9]+) / 
	and $contig=$1
	    and $unigene_align{$project}{$contig}{summary}=[$2, $3, $4]
		and $readline=1
		    and next;
    
#skip comment lines
    /^\s*\*+\s+/
	and next;
    
#read the member ESTs, start and stop positions
#NOTE:  It is safe to assume that all the trimming info
#       has been read before reaching the contig info
    
    if ($readline){
	tr/\)\(//d;
	/^\s*C/ or substr($_, 0, 0)='N';
	my ($reverse, $start, $end, $est, $score, $othermatch, $pct1, $pct2, $pct3, $front_trim, $front_trim_match, $back_trim, $back_trim_match)=split;
	my $direction='u';
	$est=~/^.+\.([bg])/
	    and $direction=$1;
	$est=~/^(.+)-([0-9]+)$/
	    and my $cid=$1
		and my $seqid=$2;
	$cid=~s/\.[bg]$//;
	my ($vector_start, $vector_end, $vector_seq)=(0,0, '');
	$vector{$seqid} 
	and $vector_start=$vector{$seqid}[0] 
	    and $vector_end=$vector{$seqid}[1]
		and $vector_seq=$vector{$seqid}[2];
	$unigene_align{$project}{$contig}{$seqid}=[$cid, $reverse, $direction, $start, $end, $front_trim, $back_trim, $vector_start, $vector_end, $vector_seq];
	next;
    }
    
}

close FILEIN;





open FILEOUT, ">$align_file";

print FILEOUT "
#Parsed contig info from phrap standard output
#Format:
#
#lib name
#*tab*contig nr, reads nr, contig length (untrimmed), contig length (trimmed)
#*tab**tab*est nr, est cornell id, reversed sequence (C=yes, N=no),
#direction (b=5\', g=3\'), start location within contig, end location
#within contig, seq trimmed for assemby at start, seq trimmed at end, 
#possible vector start, possible vector end, possible vector sequence\n
";

my $proj;

foreach $proj (keys %unigene_align){
    $proj or (print "Empty project\n" and next);
    print FILEOUT "Project $proj\n";
    foreach $contig (keys %{$unigene_align{$proj}}){
	$contig or (print "Empty contig id in $proj\n" and next);
	print FILEOUT "\tContig$contig, ", join (', ', @{$unigene_align{$proj}{$contig}{summary}}), "\n";
	foreach $est (keys %{$unigene_align{$proj}{$contig}}){
	    $est or (print "Empty est id in $contig of $proj\n" and next);
	    $est ne 'summary'
		and print FILEOUT "\t\t$est, ", join (', ', @{$unigene_align{$proj}{$contig}{$est}}), "\n";
	}
    }
}

close FILEOUT;

runtime::runtime_print($time, 'Contig membership alignment extraction');

runtime::runtime_print($start_time, "Unigene assembly");

